diff --git a/.dockerignore b/.dockerignore
new file mode 120000
index 0000000000000000000000000000000000000000..3e4e48b0b5fe6b468434d6767749b399319f2da2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.gitignore
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index ee8489c1d71bd050b9a1d9358a664d2294165292..1c9730a5ad57cd70613c0692529bcb1ccf056d59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,7 @@ build/
 .cproject
 .pydevproject
 Makefile
+.test_env/
+
+*~
+bazel-*
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..f635e65784af47a21df80cc92073ef14eba9a731
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "warp-ctc"]
+	path = warp-ctc
+	url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 90c25e435083d78ad4c123999a588aaf9092f719..b9902a863d864b28f0fad0fefe64248e356010e4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,10 +2,12 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
+        files: (?!.*warp-ctc)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
-    -   id: yapf
+    - id: yapf
+      files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$  # Bazel BUILD files follow Python syntax.
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
     hooks:
@@ -13,6 +15,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
+        files: (?!.*warp-ctc)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14..5b14f8e61e6143bb22a3aad5e0a9b11688b1b4be 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,10 +8,13 @@ os:
 env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
+  - JOB=PRE_COMMIT
 matrix:
   exclude:
     - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux
+      env: JOB=DOCS  # Only generate documentation in linux.
+    - os: osx
+      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
 
 addons:
   apt:
@@ -26,10 +29,6 @@ addons:
       - python-pip
       - python2.7-dev
       - m4
-      - libprotobuf-dev
-      - doxygen
-      - protobuf-compiler
-      - python-protobuf
       - python-numpy
       - python-wheel
       - libgoogle-glog-dev
@@ -39,18 +38,25 @@ addons:
       - lcov
       - graphviz
       - swig
+      - clang-format-3.8
+      - automake
+      - libtool
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
-      then
-        echo "Only markdown docs were updated, stopping build process."
-        exit
+      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
+      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
+        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
+        then
+          echo "Only markdown docs were updated, stopping build process."
+          exit
+        fi
       fi
     fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
+  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 090ac9e188422099cc4270b87064b5590e7b620c..65fbbb481c432f7b905f4dec7ea39c51ec853ae8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,6 @@
 cmake_minimum_required(VERSION 2.8)
 
 project(paddle CXX C)
-set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 9)
-set(PADDLE_PATCH_VERSION 0a0)
-set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
@@ -12,14 +8,25 @@ include(package)
 find_package(SWIG 2.0)
 find_package(CUDA QUIET)
 find_package(Protobuf REQUIRED)
+
+# Check protobuf library version.
+execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+    OUTPUT_VARIABLE PROTOBUF_VERSION)
+string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
+
+set(PROTOBUF_3 OFF)
+if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
+    set(PROTOBUF_3 ON)
+endif()
+
 find_package(PythonLibs 2.7 REQUIRED)
 find_package(PythonInterp 2.7 REQUIRED)
 find_package(ZLIB REQUIRED)
 find_package(NumPy REQUIRED)
 find_package(Threads REQUIRED)
 find_package(AVX QUIET)
-find_package(Glog)
-find_package(Gflags QUIET)
+find_package(Glog REQUIRED)
+find_package(Gflags REQUIRED)
 find_package(GTest)
 find_package(Sphinx)
 find_package(Doxygen)
@@ -33,9 +40,8 @@ option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
 option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
-option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -43,13 +49,7 @@ option(ON_TRAVIS "Running test on travis-ci or not." OFF)
 option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
 option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
 
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
 
-include(enableCXX11)
 include(cpplint)
 include(ccache)
 if(WITH_RDMA)
@@ -63,50 +63,25 @@ include(check_packages)
 include(swig)
 include(coveralls)
 
-# add PaddlePaddle version
-if(DEFINED ENV{PADDLE_VERSION})
-    add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
-else()
-    if(EXISTS ${PROJ_ROOT}/.svn/)
-        find_package(Subversion REQUIRED)
-        if(SUBVERSION_FOUND)
-            Subversion_WC_INFO(${PROJ_ROOT} Project)
-            add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
-        endif()
-    elseif(EXISTS ${PROJ_ROOT}/.git/)
-        find_package(Git REQUIRED)
-        execute_process(
-            COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
-            WORKING_DIRECTORY ${PROJ_ROOT}
-            OUTPUT_VARIABLE GIT_SHA1
-            RESULT_VARIABLE GIT_RESULT
-            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(NOT ${GIT_RESULT})
-            add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
-        else()
-            message(WARNING "Cannot add paddle version from git tag")
-        endif()
-    endif()
-endif()
-
+# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(Git REQUIRED)
+# version.cmake will get the current PADDLE_VERSION
+include(version)
+add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
 
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
+
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
-    if(${CUDA_VERSION_MAJOR} GREATER 6)
-        if(COMPILER_SUPPORT_CXX11)
-            LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-        endif()
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
+        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
     endif()
 
-    # TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
-    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
     if(NOT CUDNN_FOUND)
         message(FATAL_ERROR "Paddle need cudnn to compile")
     endif()
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
 
     if(WITH_AVX)
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
@@ -114,16 +89,15 @@ else()
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)
 
-    if(WITH_DSO)
-        set(CUDA_LIBRARIES "")
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
 if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
     set(ACCURACY double)
@@ -135,6 +109,10 @@ if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
 if(WITH_AVX)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
@@ -156,16 +134,12 @@ else(WITH_RDMA)
   add_definitions(-DPADDLE_DISABLE_RDMA)
 endif(WITH_RDMA)
 
-if(WITH_GLOG)
-    add_definitions(-DPADDLE_USE_GLOG)
-    include_directories(${LIBGLOG_INCLUDE_DIR})
-endif()
+# glog
+include_directories(${LIBGLOG_INCLUDE_DIR})
 
-if(WITH_GFLAGS)
-    add_definitions(-DPADDLE_USE_GFLAGS)
-    add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-    include_directories(${GFLAGS_INCLUDE_DIRS})
-endif()
+#gflags
+add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
+include_directories(${GFLAGS_INCLUDE_DIRS})
 
 if(WITH_TESTING)
     enable_testing()
@@ -189,5 +163,4 @@ add_subdirectory(paddle)
 add_subdirectory(python)
 if(WITH_DOC)
     add_subdirectory(doc)
-    add_subdirectory(doc_cn)
 endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d4bb973ae87bb45ef4386a63c26ed62602f2cee
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1 @@
+./doc/howto/dev/contribute_to_paddle_en.md
diff --git a/LICENSE b/LICENSE
index 2ff3140db0d7025369cb8fe3036ec0679eb6a023..e77bd090ee4f1ae549089f7de660a58364f42170 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 
                                  Apache License
                            Version 2.0, January 2004
@@ -188,7 +188,7 @@ Copyright (c) 2016 Baidu, Inc. All Rights Reserved
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a245ab442ba0fc63d1f1fda932e7590a6fe4ca
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,69 @@
+# Release v0.9.0
+
+## New Features:
+
+* New Layers
+  * bilinear interpolation layer.
+  * spatial pyramid-pool layer.
+  * de-convolution layer.
+  * maxout layer.
+* Support rectangle padding, stride, window and input for Pooling Operation.
+* Add —job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON.
+* Expose cost_weight/nce_layer in `trainer_config_helpers`
+* Add FAQ, concepts, h-rnn docs.
+* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
+* Add usage track scripts.
+
+## Improvements
+
+* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
+* Add code coverage tools.
+* Refine convolution layer to speedup and reduce GPU memory.
+* Speed up PyDataProvider2
+* Add ubuntu deb package build scripts.
+* Make Paddle use git-flow branching model.
+* PServer support no parameter blocks.
+
+## Bug Fixes
+
+* add zlib link to py_paddle
+* add input sparse data check for sparse layer at runtime
+* Bug fix for sparse matrix multiplication
+* Fix floating-point overflow problem of tanh
+* Fix some nvcc compile options
+* Fix a bug in yield dictionary in DataProvider
+* Fix SRL hang when exit.
+
+# Release v0.8.0beta.1
+New features:
+
+* Mac OSX is supported by source code. #138
+   * Both GPU and CPU versions of PaddlePaddle are supported.
+
+* Support CUDA 8.0
+
+* Enhance `PyDataProvider2`
+   * Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features.
+   * Add `min_pool_size` to control memory pool in provider.
+
+* Add `deb` install package & docker image for no_avx machines.
+   * Especially for cloud computing and virtual machines
+
+* Automatically disable `avx` instructions in cmake when machine's CPU don't support `avx` instructions.
+
+* Add Parallel NN api in trainer_config_helpers.
+
+* Add `travis ci` for Github
+
+Bug fixes:
+
+* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
+* Check if PaddlePaddle is installed when unittest.
+* Fix bugs in GTX series GPU
+* Fix bug in MultinomialSampler
+
+Also more documentation was written since last release.
+
+# Release v0.8.0beta.0
+
+PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..f097c41da85affd1ff0b24200dbdbc63bf9c3ab6
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,31 @@
+# External dependency to Google protobuf.
+http_archive(
+    name="protobuf",
+    url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
+    sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
+    strip_prefix="protobuf-3.1.0")
+
+# External dependency to gtest 1.7.0.  This method comes from
+# https://www.bazel.io/versions/master/docs/tutorial/cpp.html.
+new_http_archive(
+    name="gtest",
+    url="https://github.com/google/googletest/archive/release-1.7.0.zip",
+    sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
+    build_file="third_party/gtest.BUILD",
+    strip_prefix="googletest-release-1.7.0")
+
+# External dependency to gflags.  This method comes from
+# https://github.com/gflags/example/blob/master/WORKSPACE.
+new_git_repository(
+    name="gflags",
+    tag="v2.2.0",
+    remote="https://github.com/gflags/gflags.git",
+    build_file="third_party/gflags.BUILD")
+
+# External dependency to glog.  This method comes from
+# https://github.com/reyoung/bazel_playground/blob/master/WORKSPACE
+new_git_repository(
+    name="glog",
+    remote="https://github.com/google/glog.git",
+    commit="b6a5e0524c28178985f0d228e9eaa43808dbec3c",
+    build_file="third_party/glog.BUILD")
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7b66e8a5b5020fd847982db401665d24ba3a069c
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1,9 @@
+paddle/image/logs
+paddle/image/*.pyc
+paddle/image/train.list
+paddle/rnn/logs
+paddle/rnn/*.pyc
+paddle/rnn/imdb.pkl
+caffe/image/logs
+tensorflow/image/logs
+tensorflow/rnn/logs
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..367013f0457f9bbb9ae1335ea63dce181316d444
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,168 @@
+# Benchmark
+
+Machine: 
+
+- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
+- GPU: Tesla K40m
+- cuDNN: v5.1
+- system: Docker 1.12.1, all platforms are tested in docker environment.
+
+Platforms: 
+
+- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 
+- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
+- Caffe: kaixhin/cuda-caffe
+
+Several convolutional neural networks and recurrent neural networks are used to test.
+
+## Image
+
+### Benchmark Model
+
+AlexNet, GoogleNet and a small network used in Caffe.
+
+- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
+
+- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
+
+- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
+
+
+### Single-GPU
+
+- AlexNet:  input - 3 * 227 * 227,  Time: ms/batch
+
+| BatchSize    | 64  | 128  | 256   | 512  |
+|--------------|-----| -----| ------| -----|
+| PaddlePaddle | 195 | 334  | 602   | 1629 |
+| TensorFlow   | 223 | 364  | 645   | 1235 |
+| Caffe        | 324 | 627  | 1232  | 2513 |
+ 
+**Notation**
+
+All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
+ 
+- GoogletNet:  input - 3 * 224 * 224, Time: ms/batch
+
+
+| BatchSize    | 64    |   128  | 256     |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 613   | 1149   | 2348    |
+| TensorFlow   | 644   | 1176   | 2219    |
+| Caffe        | 694   | 1364   | out of memory   |
+
+- SmallNet: input - 3 * 32 * 32, Time ms/batch
+
+| BatchSize    | 64     |   128    | 256     | 512     |
+|--------------|--------| -------- | --------|---------|
+| PaddlePaddle | 10.463 | 18.184   | 33.113  |  63.039 |
+| TensorFlow   | 9     | 15       | 28      | 59       |
+| Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
+
+**Notation**
+
+All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
+
+In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
+
+### Multi-GPU: 4 GPUs
+
+- AlexNet,  ms / batch
+
+| total-BatchSize | 128 * 4  | 256 * 4    |
+|------------------|----------| -----------|
+| PaddlePaddle     | 347      | 622        |
+| TensorFlow       | 377      | 675        |
+| Caffe            | 1229     | 2435       |
+
+For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
+
+```
+  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
+= (334 * 4)/347 
+= 3.85
+``` 
+
+<img src="figs/alexnet-4gpu.png" width="420">
+
+
+- GoogleNet, ms / batch
+
+| total-BatchSize  | 128 * 4      |  256 * 4    |
+|-------------------|--------------| ----------- |
+| PaddlePaddle      | 1178         | 2367        |
+| TensorFlow        | 1210         | 2292        |
+| Caffe             | 2007         | out of memory  |
+
+<img src="figs/googlenet-4gpu.png" width="420">
+
+
+## RNN
+We use lstm network for text classfication to test benchmark.
+
+### Dataset
+-  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
+- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
+- Dictionary size=30000 
+- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
+
+### Single-GPU
+
+#### LSTM in Text Classification
+
+Testing `2 lstm layer + fc` network with different hidden size and batch size.
+  
+- Batch size = 64, ms / batch
+ 
+| hidden_size  | 256   | 512    |  1280   |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 83    | 184    | 641     |
+| TensorFlow   | 175   | 280    | 818     |
+
+- Batch size = 128, ms / batch
+ 
+| hidden_size  | 256    | 512    |  1280   |
+|--------------|------- | -------| --------|
+| PaddlePaddle | 110    | 261    | 1007    |
+| TensorFlow   | 181    | 361    | 1237    |
+
+
+- Batch size = 256, ms / batch
+ 
+| hidden_size  | 256   | 512    |  1280   |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 170   | 414    | 1655    |
+| TensorFlow   | 238   | 536    | 1905    |
+
+<img src="figs/rnn_lstm_cls.png" width="600">
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
+ 
+
+### Multi GPU: 4 GPUs
+
+#### LSTM in Text Classification
+
+- hidden_size = 256, ms / batch
+ 
+| batch_size   | 256    |  512    |
+|--------------| -------| --------|
+| PaddlePaddle | 90     | 118     |
+| TensorFlow   | 226    | 118     |
+
+
+- hidden_size = 512, ms / batch
+ 
+| batch_size   | 256    |  512    |
+|--------------| -------| --------|
+| PaddlePaddle | 189    | 268     |
+| TensorFlow   | 297    | 383     |
+
+
+<img src="figs/rnn_lstm_4gpus.png" width="420">
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
diff --git a/benchmark/caffe/image/alexnet.prototxt b/benchmark/caffe/image/alexnet.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..aca184ddaf2ca2b5e2bea17d131055e0621b8271
--- /dev/null
+++ b/benchmark/caffe/image/alexnet.prototxt
@@ -0,0 +1,347 @@
+name: "alexnet"
+input: "data"
+input_dim: 64
+input_dim: 3
+input_dim: 227
+input_dim: 227
+input: "label"
+input_dim: 64
+input_dim: 1
+input_dim: 1
+input_dim: 1 
+force_backward: true
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "drop6"
+  type: "Dropout"
+  bottom: "fc6"
+  top: "fc6"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "drop7"
+  type: "Dropout"
+  bottom: "fc7"
+  top: "fc7"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/benchmark/caffe/image/googlenet.prototxt b/benchmark/caffe/image/googlenet.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5f3b4fe3efcb6f7397031c086997fa914c67b7f
--- /dev/null
+++ b/benchmark/caffe/image/googlenet.prototxt
@@ -0,0 +1,2334 @@
+name: "googlenet"
+input: "data"
+input_dim: 128
+input_dim: 3
+input_dim: 224
+input_dim: 224
+input: "label"
+input_dim: 128
+input_dim: 1
+input_dim: 1
+input_dim: 1 
+layer {
+  name: "conv1/7x7_s2"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1/7x7_s2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 3
+    kernel_size: 7
+    stride: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "conv1/relu_7x7"
+  type: "ReLU"
+  bottom: "conv1/7x7_s2"
+  top: "conv1/7x7_s2"
+}
+layer {
+  name: "pool1/3x3_s2"
+  type: "Pooling"
+  bottom: "conv1/7x7_s2"
+  top: "pool1/3x3_s2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+#layer {
+#  name: "pool1/norm1"
+#  type: "LRN"
+#  bottom: "pool1/3x3_s2"
+#  top: "pool1/norm1"
+#  lrn_param {
+#    local_size: 5
+#    alpha: 0.0001
+#    beta: 0.75
+#  }
+#}
+layer {
+  name: "conv2/3x3_reduce"
+  type: "Convolution"
+#  bottom: "pool1/norm1"
+  bottom: "pool1/3x3_s2"
+  top: "conv2/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "conv2/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "conv2/3x3_reduce"
+  top: "conv2/3x3_reduce"
+}
+layer {
+  name: "conv2/3x3"
+  type: "Convolution"
+  bottom: "conv2/3x3_reduce"
+  top: "conv2/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "conv2/relu_3x3"
+  type: "ReLU"
+  bottom: "conv2/3x3"
+  top: "conv2/3x3"
+}
+#layer {
+#  name: "conv2/norm2"
+#  type: "LRN"
+#  bottom: "conv2/3x3"
+#  top: "conv2/norm2"
+#  lrn_param {
+#    local_size: 5
+#    alpha: 0.0001
+#    beta: 0.75
+#  }
+#}
+layer {
+  name: "pool2/3x3_s2"
+  type: "Pooling"
+#  bottom: "conv2/norm2"
+  bottom: "conv2/3x3"
+  top: "pool2/3x3_s2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "inception_3a/1x1"
+  type: "Convolution"
+  bottom: "pool2/3x3_s2"
+  top: "inception_3a/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_3a/1x1"
+  top: "inception_3a/1x1"
+}
+layer {
+  name: "inception_3a/3x3_reduce"
+  type: "Convolution"
+  bottom: "pool2/3x3_s2"
+  top: "inception_3a/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_3a/3x3_reduce"
+  top: "inception_3a/3x3_reduce"
+}
+layer {
+  name: "inception_3a/3x3"
+  type: "Convolution"
+  bottom: "inception_3a/3x3_reduce"
+  top: "inception_3a/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_3a/3x3"
+  top: "inception_3a/3x3"
+}
+layer {
+  name: "inception_3a/5x5_reduce"
+  type: "Convolution"
+  bottom: "pool2/3x3_s2"
+  top: "inception_3a/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_3a/5x5_reduce"
+  top: "inception_3a/5x5_reduce"
+}
+layer {
+  name: "inception_3a/5x5"
+  type: "Convolution"
+  bottom: "inception_3a/5x5_reduce"
+  top: "inception_3a/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_3a/5x5"
+  top: "inception_3a/5x5"
+}
+layer {
+  name: "inception_3a/pool"
+  type: "Pooling"
+  bottom: "pool2/3x3_s2"
+  top: "inception_3a/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_3a/pool_proj"
+  type: "Convolution"
+  bottom: "inception_3a/pool"
+  top: "inception_3a/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3a/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_3a/pool_proj"
+  top: "inception_3a/pool_proj"
+}
+layer {
+  name: "inception_3a/output"
+  type: "Concat"
+  bottom: "inception_3a/1x1"
+  bottom: "inception_3a/3x3"
+  bottom: "inception_3a/5x5"
+  bottom: "inception_3a/pool_proj"
+  top: "inception_3a/output"
+}
+layer {
+  name: "inception_3b/1x1"
+  type: "Convolution"
+  bottom: "inception_3a/output"
+  top: "inception_3b/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_3b/1x1"
+  top: "inception_3b/1x1"
+}
+layer {
+  name: "inception_3b/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_3a/output"
+  top: "inception_3b/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_3b/3x3_reduce"
+  top: "inception_3b/3x3_reduce"
+}
+layer {
+  name: "inception_3b/3x3"
+  type: "Convolution"
+  bottom: "inception_3b/3x3_reduce"
+  top: "inception_3b/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_3b/3x3"
+  top: "inception_3b/3x3"
+}
+layer {
+  name: "inception_3b/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_3a/output"
+  top: "inception_3b/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_3b/5x5_reduce"
+  top: "inception_3b/5x5_reduce"
+}
+layer {
+  name: "inception_3b/5x5"
+  type: "Convolution"
+  bottom: "inception_3b/5x5_reduce"
+  top: "inception_3b/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_3b/5x5"
+  top: "inception_3b/5x5"
+}
+layer {
+  name: "inception_3b/pool"
+  type: "Pooling"
+  bottom: "inception_3a/output"
+  top: "inception_3b/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_3b/pool_proj"
+  type: "Convolution"
+  bottom: "inception_3b/pool"
+  top: "inception_3b/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_3b/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_3b/pool_proj"
+  top: "inception_3b/pool_proj"
+}
+layer {
+  name: "inception_3b/output"
+  type: "Concat"
+  bottom: "inception_3b/1x1"
+  bottom: "inception_3b/3x3"
+  bottom: "inception_3b/5x5"
+  bottom: "inception_3b/pool_proj"
+  top: "inception_3b/output"
+}
+layer {
+  name: "pool3/3x3_s2"
+  type: "Pooling"
+  bottom: "inception_3b/output"
+  top: "pool3/3x3_s2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "inception_4a/1x1"
+  type: "Convolution"
+  bottom: "pool3/3x3_s2"
+  top: "inception_4a/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_4a/1x1"
+  top: "inception_4a/1x1"
+}
+layer {
+  name: "inception_4a/3x3_reduce"
+  type: "Convolution"
+  bottom: "pool3/3x3_s2"
+  top: "inception_4a/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_4a/3x3_reduce"
+  top: "inception_4a/3x3_reduce"
+}
+layer {
+  name: "inception_4a/3x3"
+  type: "Convolution"
+  bottom: "inception_4a/3x3_reduce"
+  top: "inception_4a/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 208
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_4a/3x3"
+  top: "inception_4a/3x3"
+}
+layer {
+  name: "inception_4a/5x5_reduce"
+  type: "Convolution"
+  bottom: "pool3/3x3_s2"
+  top: "inception_4a/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_4a/5x5_reduce"
+  top: "inception_4a/5x5_reduce"
+}
+layer {
+  name: "inception_4a/5x5"
+  type: "Convolution"
+  bottom: "inception_4a/5x5_reduce"
+  top: "inception_4a/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 48
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_4a/5x5"
+  top: "inception_4a/5x5"
+}
+layer {
+  name: "inception_4a/pool"
+  type: "Pooling"
+  bottom: "pool3/3x3_s2"
+  top: "inception_4a/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_4a/pool_proj"
+  type: "Convolution"
+  bottom: "inception_4a/pool"
+  top: "inception_4a/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4a/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_4a/pool_proj"
+  top: "inception_4a/pool_proj"
+}
+layer {
+  name: "inception_4a/output"
+  type: "Concat"
+  bottom: "inception_4a/1x1"
+  bottom: "inception_4a/3x3"
+  bottom: "inception_4a/5x5"
+  bottom: "inception_4a/pool_proj"
+  top: "inception_4a/output"
+}
+#layer {
+#  name: "loss1/ave_pool"
+#  type: "Pooling"
+#  bottom: "inception_4a/output"
+#  top: "loss1/ave_pool"
+#  pooling_param {
+#    pool: AVE
+#    kernel_size: 5
+#    stride: 3
+#  }
+#}
+#layer {
+#  name: "loss1/conv"
+#  type: "Convolution"
+#  bottom: "loss1/ave_pool"
+#  top: "loss1/conv"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  convolution_param {
+#    num_output: 128
+#    kernel_size: 1
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0.2
+#    }
+#  }
+#}
+#layer {
+#  name: "loss1/relu_conv"
+#  type: "ReLU"
+#  bottom: "loss1/conv"
+#  top: "loss1/conv"
+#}
+#layer {
+#  name: "loss1/fc"
+#  type: "InnerProduct"
+#  bottom: "loss1/conv"
+#  top: "loss1/fc"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  inner_product_param {
+#    num_output: 1024
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0.2
+#    }
+#  }
+#}
+#layer {
+#  name: "loss1/relu_fc"
+#  type: "ReLU"
+#  bottom: "loss1/fc"
+#  top: "loss1/fc"
+#}
+#layer {
+#  name: "loss1/drop_fc"
+#  type: "Dropout"
+#  bottom: "loss1/fc"
+#  top: "loss1/fc"
+#  dropout_param {
+#    dropout_ratio: 0.7
+#  }
+#}
+#layer {
+#  name: "loss1/classifier"
+#  type: "InnerProduct"
+#  bottom: "loss1/fc"
+#  top: "loss1/classifier"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  inner_product_param {
+#    num_output: 1000
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0
+#    }
+#  }
+#}
+#layer {
+#  name: "loss1/loss"
+#  type: "SoftmaxWithLoss"
+#  bottom: "loss1/classifier"
+#  bottom: "label"
+#  top: "loss1/loss1"
+#  loss_weight: 0.3
+#}
+layer {
+  name: "inception_4b/1x1"
+  type: "Convolution"
+  bottom: "inception_4a/output"
+  top: "inception_4b/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 160
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_4b/1x1"
+  top: "inception_4b/1x1"
+}
+layer {
+  name: "inception_4b/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_4a/output"
+  top: "inception_4b/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 112
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_4b/3x3_reduce"
+  top: "inception_4b/3x3_reduce"
+}
+layer {
+  name: "inception_4b/3x3"
+  type: "Convolution"
+  bottom: "inception_4b/3x3_reduce"
+  top: "inception_4b/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 224
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_4b/3x3"
+  top: "inception_4b/3x3"
+}
+layer {
+  name: "inception_4b/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_4a/output"
+  top: "inception_4b/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_4b/5x5_reduce"
+  top: "inception_4b/5x5_reduce"
+}
+layer {
+  name: "inception_4b/5x5"
+  type: "Convolution"
+  bottom: "inception_4b/5x5_reduce"
+  top: "inception_4b/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_4b/5x5"
+  top: "inception_4b/5x5"
+}
+layer {
+  name: "inception_4b/pool"
+  type: "Pooling"
+  bottom: "inception_4a/output"
+  top: "inception_4b/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_4b/pool_proj"
+  type: "Convolution"
+  bottom: "inception_4b/pool"
+  top: "inception_4b/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4b/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_4b/pool_proj"
+  top: "inception_4b/pool_proj"
+}
+layer {
+  name: "inception_4b/output"
+  type: "Concat"
+  bottom: "inception_4b/1x1"
+  bottom: "inception_4b/3x3"
+  bottom: "inception_4b/5x5"
+  bottom: "inception_4b/pool_proj"
+  top: "inception_4b/output"
+}
+layer {
+  name: "inception_4c/1x1"
+  type: "Convolution"
+  bottom: "inception_4b/output"
+  top: "inception_4c/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_4c/1x1"
+  top: "inception_4c/1x1"
+}
+layer {
+  name: "inception_4c/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_4b/output"
+  top: "inception_4c/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_4c/3x3_reduce"
+  top: "inception_4c/3x3_reduce"
+}
+layer {
+  name: "inception_4c/3x3"
+  type: "Convolution"
+  bottom: "inception_4c/3x3_reduce"
+  top: "inception_4c/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_4c/3x3"
+  top: "inception_4c/3x3"
+}
+layer {
+  name: "inception_4c/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_4b/output"
+  top: "inception_4c/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_4c/5x5_reduce"
+  top: "inception_4c/5x5_reduce"
+}
+layer {
+  name: "inception_4c/5x5"
+  type: "Convolution"
+  bottom: "inception_4c/5x5_reduce"
+  top: "inception_4c/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_4c/5x5"
+  top: "inception_4c/5x5"
+}
+layer {
+  name: "inception_4c/pool"
+  type: "Pooling"
+  bottom: "inception_4b/output"
+  top: "inception_4c/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_4c/pool_proj"
+  type: "Convolution"
+  bottom: "inception_4c/pool"
+  top: "inception_4c/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4c/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_4c/pool_proj"
+  top: "inception_4c/pool_proj"
+}
+layer {
+  name: "inception_4c/output"
+  type: "Concat"
+  bottom: "inception_4c/1x1"
+  bottom: "inception_4c/3x3"
+  bottom: "inception_4c/5x5"
+  bottom: "inception_4c/pool_proj"
+  top: "inception_4c/output"
+}
+layer {
+  name: "inception_4d/1x1"
+  type: "Convolution"
+  bottom: "inception_4c/output"
+  top: "inception_4d/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 112
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_4d/1x1"
+  top: "inception_4d/1x1"
+}
+layer {
+  name: "inception_4d/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_4c/output"
+  top: "inception_4d/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 144
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_4d/3x3_reduce"
+  top: "inception_4d/3x3_reduce"
+}
+layer {
+  name: "inception_4d/3x3"
+  type: "Convolution"
+  bottom: "inception_4d/3x3_reduce"
+  top: "inception_4d/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 288
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_4d/3x3"
+  top: "inception_4d/3x3"
+}
+layer {
+  name: "inception_4d/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_4c/output"
+  top: "inception_4d/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_4d/5x5_reduce"
+  top: "inception_4d/5x5_reduce"
+}
+layer {
+  name: "inception_4d/5x5"
+  type: "Convolution"
+  bottom: "inception_4d/5x5_reduce"
+  top: "inception_4d/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_4d/5x5"
+  top: "inception_4d/5x5"
+}
+layer {
+  name: "inception_4d/pool"
+  type: "Pooling"
+  bottom: "inception_4c/output"
+  top: "inception_4d/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_4d/pool_proj"
+  type: "Convolution"
+  bottom: "inception_4d/pool"
+  top: "inception_4d/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4d/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_4d/pool_proj"
+  top: "inception_4d/pool_proj"
+}
+layer {
+  name: "inception_4d/output"
+  type: "Concat"
+  bottom: "inception_4d/1x1"
+  bottom: "inception_4d/3x3"
+  bottom: "inception_4d/5x5"
+  bottom: "inception_4d/pool_proj"
+  top: "inception_4d/output"
+}
+#layer {
+#  name: "loss2/ave_pool"
+#  type: "Pooling"
+#  bottom: "inception_4d/output"
+#  top: "loss2/ave_pool"
+#  pooling_param {
+#    pool: AVE
+#    kernel_size: 5
+#    stride: 3
+#  }
+#}
+#layer {
+#  name: "loss2/conv"
+#  type: "Convolution"
+#  bottom: "loss2/ave_pool"
+#  top: "loss2/conv"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  convolution_param {
+#    num_output: 128
+#    kernel_size: 1
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0.2
+#    }
+#  }
+#}
+#layer {
+#  name: "loss2/relu_conv"
+#  type: "ReLU"
+#  bottom: "loss2/conv"
+#  top: "loss2/conv"
+#}
+#layer {
+#  name: "loss2/fc"
+#  type: "InnerProduct"
+#  bottom: "loss2/conv"
+#  top: "loss2/fc"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  inner_product_param {
+#    num_output: 1024
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0.2
+#    }
+#  }
+#}
+#layer {
+#  name: "loss2/relu_fc"
+#  type: "ReLU"
+#  bottom: "loss2/fc"
+#  top: "loss2/fc"
+#}
+#layer {
+#  name: "loss2/drop_fc"
+#  type: "Dropout"
+#  bottom: "loss2/fc"
+#  top: "loss2/fc"
+#  dropout_param {
+#    dropout_ratio: 0.7
+#  }
+#}
+#layer {
+#  name: "loss2/classifier"
+#  type: "InnerProduct"
+#  bottom: "loss2/fc"
+#  top: "loss2/classifier"
+#  param {
+#    lr_mult: 1
+#    decay_mult: 1
+#  }
+#  param {
+#    lr_mult: 2
+#    decay_mult: 0
+#  }
+#  inner_product_param {
+#    num_output: 1000
+#    weight_filler {
+#      type: "xavier"
+#    }
+#    bias_filler {
+#      type: "constant"
+#      value: 0
+#    }
+#  }
+#}
+#layer {
+#  name: "loss2/loss"
+#  type: "SoftmaxWithLoss"
+#  bottom: "loss2/classifier"
+#  bottom: "label"
+#  top: "loss2/loss1"
+#  loss_weight: 0.3
+#}
+layer {
+  name: "inception_4e/1x1"
+  type: "Convolution"
+  bottom: "inception_4d/output"
+  top: "inception_4e/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_4e/1x1"
+  top: "inception_4e/1x1"
+}
+layer {
+  name: "inception_4e/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_4d/output"
+  top: "inception_4e/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 160
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_4e/3x3_reduce"
+  top: "inception_4e/3x3_reduce"
+}
+layer {
+  name: "inception_4e/3x3"
+  type: "Convolution"
+  bottom: "inception_4e/3x3_reduce"
+  top: "inception_4e/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 320
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_4e/3x3"
+  top: "inception_4e/3x3"
+}
+layer {
+  name: "inception_4e/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_4d/output"
+  top: "inception_4e/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_4e/5x5_reduce"
+  top: "inception_4e/5x5_reduce"
+}
+layer {
+  name: "inception_4e/5x5"
+  type: "Convolution"
+  bottom: "inception_4e/5x5_reduce"
+  top: "inception_4e/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_4e/5x5"
+  top: "inception_4e/5x5"
+}
+layer {
+  name: "inception_4e/pool"
+  type: "Pooling"
+  bottom: "inception_4d/output"
+  top: "inception_4e/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_4e/pool_proj"
+  type: "Convolution"
+  bottom: "inception_4e/pool"
+  top: "inception_4e/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_4e/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_4e/pool_proj"
+  top: "inception_4e/pool_proj"
+}
+layer {
+  name: "inception_4e/output"
+  type: "Concat"
+  bottom: "inception_4e/1x1"
+  bottom: "inception_4e/3x3"
+  bottom: "inception_4e/5x5"
+  bottom: "inception_4e/pool_proj"
+  top: "inception_4e/output"
+}
+layer {
+  name: "pool4/3x3_s2"
+  type: "Pooling"
+  bottom: "inception_4e/output"
+  top: "pool4/3x3_s2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "inception_5a/1x1"
+  type: "Convolution"
+  bottom: "pool4/3x3_s2"
+  top: "inception_5a/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_5a/1x1"
+  top: "inception_5a/1x1"
+}
+layer {
+  name: "inception_5a/3x3_reduce"
+  type: "Convolution"
+  bottom: "pool4/3x3_s2"
+  top: "inception_5a/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 160
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_5a/3x3_reduce"
+  top: "inception_5a/3x3_reduce"
+}
+layer {
+  name: "inception_5a/3x3"
+  type: "Convolution"
+  bottom: "inception_5a/3x3_reduce"
+  top: "inception_5a/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 320
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_5a/3x3"
+  top: "inception_5a/3x3"
+}
+layer {
+  name: "inception_5a/5x5_reduce"
+  type: "Convolution"
+  bottom: "pool4/3x3_s2"
+  top: "inception_5a/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_5a/5x5_reduce"
+  top: "inception_5a/5x5_reduce"
+}
+layer {
+  name: "inception_5a/5x5"
+  type: "Convolution"
+  bottom: "inception_5a/5x5_reduce"
+  top: "inception_5a/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_5a/5x5"
+  top: "inception_5a/5x5"
+}
+layer {
+  name: "inception_5a/pool"
+  type: "Pooling"
+  bottom: "pool4/3x3_s2"
+  top: "inception_5a/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_5a/pool_proj"
+  type: "Convolution"
+  bottom: "inception_5a/pool"
+  top: "inception_5a/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5a/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_5a/pool_proj"
+  top: "inception_5a/pool_proj"
+}
+layer {
+  name: "inception_5a/output"
+  type: "Concat"
+  bottom: "inception_5a/1x1"
+  bottom: "inception_5a/3x3"
+  bottom: "inception_5a/5x5"
+  bottom: "inception_5a/pool_proj"
+  top: "inception_5a/output"
+}
+layer {
+  name: "inception_5b/1x1"
+  type: "Convolution"
+  bottom: "inception_5a/output"
+  top: "inception_5b/1x1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_1x1"
+  type: "ReLU"
+  bottom: "inception_5b/1x1"
+  top: "inception_5b/1x1"
+}
+layer {
+  name: "inception_5b/3x3_reduce"
+  type: "Convolution"
+  bottom: "inception_5a/output"
+  top: "inception_5b/3x3_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_3x3_reduce"
+  type: "ReLU"
+  bottom: "inception_5b/3x3_reduce"
+  top: "inception_5b/3x3_reduce"
+}
+layer {
+  name: "inception_5b/3x3"
+  type: "Convolution"
+  bottom: "inception_5b/3x3_reduce"
+  top: "inception_5b/3x3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_3x3"
+  type: "ReLU"
+  bottom: "inception_5b/3x3"
+  top: "inception_5b/3x3"
+}
+layer {
+  name: "inception_5b/5x5_reduce"
+  type: "Convolution"
+  bottom: "inception_5a/output"
+  top: "inception_5b/5x5_reduce"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_5x5_reduce"
+  type: "ReLU"
+  bottom: "inception_5b/5x5_reduce"
+  top: "inception_5b/5x5_reduce"
+}
+layer {
+  name: "inception_5b/5x5"
+  type: "Convolution"
+  bottom: "inception_5b/5x5_reduce"
+  top: "inception_5b/5x5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 2
+    kernel_size: 5
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_5x5"
+  type: "ReLU"
+  bottom: "inception_5b/5x5"
+  top: "inception_5b/5x5"
+}
+layer {
+  name: "inception_5b/pool"
+  type: "Pooling"
+  bottom: "inception_5a/output"
+  top: "inception_5b/pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 1
+    pad: 1
+  }
+}
+layer {
+  name: "inception_5b/pool_proj"
+  type: "Convolution"
+  bottom: "inception_5b/pool"
+  top: "inception_5b/pool_proj"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.2
+    }
+  }
+}
+layer {
+  name: "inception_5b/relu_pool_proj"
+  type: "ReLU"
+  bottom: "inception_5b/pool_proj"
+  top: "inception_5b/pool_proj"
+}
+layer {
+  name: "inception_5b/output"
+  type: "Concat"
+  bottom: "inception_5b/1x1"
+  bottom: "inception_5b/3x3"
+  bottom: "inception_5b/5x5"
+  bottom: "inception_5b/pool_proj"
+  top: "inception_5b/output"
+}
+layer {
+  name: "pool5/7x7_s1"
+  type: "Pooling"
+  bottom: "inception_5b/output"
+  top: "pool5/7x7_s1"
+  pooling_param {
+    pool: AVE
+    kernel_size: 7
+    stride: 1
+  }
+}
+layer {
+  name: "pool5/drop_7x7_s1"
+  type: "Dropout"
+  bottom: "pool5/7x7_s1"
+  top: "pool5/7x7_s1"
+  dropout_param {
+    dropout_ratio: 0.4
+  }
+}
+layer {
+  name: "loss3/classifier"
+  type: "InnerProduct"
+  bottom: "pool5/7x7_s1"
+  top: "loss3/classifier"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss3/loss3"
+  type: "SoftmaxWithLoss"
+  bottom: "loss3/classifier"
+  bottom: "label"
+  top: "loss3/loss3"
+  loss_weight: 1
+}
diff --git a/benchmark/caffe/image/run.sh b/benchmark/caffe/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..aa9ac20ca5cc1d48a07ce39f7d6c6d70ad4121ab
--- /dev/null
+++ b/benchmark/caffe/image/run.sh
@@ -0,0 +1,30 @@
+set -e
+
+function test() {
+  cfg=$1
+  batch=$2
+  prefix=$3
+  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg 
+  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
+  caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 64 alexnet 
+test alexnet.prototxt 128 alexnet 
+test alexnet.prototxt 256 alexnet 
+test alexnet.prototxt 512 alexnet 
+
+# googlenet
+test googlenet.prototxt 64 googlenet 
+test googlenet.prototxt 128 googlenet 
+
+# small net 
+test smallnet_mnist_cifar.prototxt 64 smallnet 
+test smallnet_mnist_cifar.prototxt 128 smallnet 
+test smallnet_mnist_cifar.prototxt 256 smallnet 
+test smallnet_mnist_cifar.prototxt 512 smallnet 
diff --git a/benchmark/caffe/image/run_multi.sh b/benchmark/caffe/image/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9a0a71bc185a421842265ea6d2310429adb86913
--- /dev/null
+++ b/benchmark/caffe/image/run_multi.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+function test() {
+  cfg=$1
+  batch=$2
+  prefix=$3
+  batch_per_gpu=`expr ${batch} / 4`
+  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
+  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
+  sed -i "1c\net : \"${cfg}\"" solver.prototxt
+  caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 512 alexnet 
+test alexnet.prototxt 1024 alexnet 
+
+# googlnet 
+test googlenet.prototxt 512 googlenet 
diff --git a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cb0e32bbfb9f785ece6d428356987e5503dd25d
--- /dev/null
+++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
@@ -0,0 +1,198 @@
+name: "mnist/cifar"
+input: "data"
+input_dim: 128 
+input_dim: 3
+input_dim: 32 
+input_dim: 32 
+input: "label"
+input_dim: 128 
+input_dim: 1
+input_dim: 1
+input_dim: 1 
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.0001
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "pool1"
+  top: "pool1"
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "conv3"
+  top: "pool3"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool3"
+  top: "ip1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 64
+    weight_filler {
+      type: "gaussian"
+      std: 0.1
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "ip2"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "ip2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "gaussian"
+      std: 0.1
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/benchmark/caffe/image/solver.prototxt b/benchmark/caffe/image/solver.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c10284e6027b4cc0b3d4c8fcf949e0a5a22a85
--- /dev/null
+++ b/benchmark/caffe/image/solver.prototxt
@@ -0,0 +1,10 @@
+net: "alexnet.prototxt"
+base_lr: 0.01
+lr_policy: "fixed"
+display: 20
+max_iter: 200
+momentum: 0.9
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "models/caffe_alexnet_train"
+solver_mode: GPU
diff --git a/benchmark/figs/alexnet-4gpu.png b/benchmark/figs/alexnet-4gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..28b95a44508f0ee7ad270c9ccdf8659009406b03
Binary files /dev/null and b/benchmark/figs/alexnet-4gpu.png differ
diff --git a/benchmark/figs/googlenet-4gpu.png b/benchmark/figs/googlenet-4gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b5331f05a3e54cacf949f10b6603bf627a6d106
Binary files /dev/null and b/benchmark/figs/googlenet-4gpu.png differ
diff --git a/benchmark/figs/rnn_lstm_4gpus.png b/benchmark/figs/rnn_lstm_4gpus.png
new file mode 100644
index 0000000000000000000000000000000000000000..973ce2fa5f65e9681c972d4f5bd5776b5c4aa264
Binary files /dev/null and b/benchmark/figs/rnn_lstm_4gpus.png differ
diff --git a/benchmark/figs/rnn_lstm_cls.png b/benchmark/figs/rnn_lstm_cls.png
new file mode 100644
index 0000000000000000000000000000000000000000..26d05cac11aa7ae8cdfbcd8c4401f6547a9404f6
Binary files /dev/null and b/benchmark/figs/rnn_lstm_cls.png differ
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0
--- /dev/null
+++ b/benchmark/paddle/image/alexnet.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+
+height = 227
+width = 227
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+# conv1
+net = data_layer('data', size=height * width * 3)
+net = img_conv_layer(
+    input=net,
+    filter_size=11,
+    num_channels=3,
+    num_filters=96,
+    stride=4,
+    padding=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv2
+net = img_conv_layer(
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv3
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
+# conv4
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+
+# conv5
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+net = fc_layer(
+    input=net,
+    size=4096,
+    act=ReluActivation(),
+    layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(
+    input=net,
+    size=4096,
+    act=ReluActivation(),
+    layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc893bab98c4d2e07c62fbd012d51a0939db4766
--- /dev/null
+++ b/benchmark/paddle/image/googlenet.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+def inception2(name, input, channels, \
+    filter1,
+    filter3R, filter3,
+    filter5R, filter5,
+    proj):
+
+    conv1 = name + '_1'
+    conv3r = name + '_3r'
+    conv3 = name + '_3'
+    conv5r = name + '_5r'
+    conv5 = name + '_5'
+    maxpool = name + '_max'
+    convproj = name + '_proj'
+
+    cov1 = img_conv_layer(
+        name=conv1,
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter1,
+        stride=1,
+        padding=0)
+
+    cov3r = img_conv_layer(
+        name=conv3r,
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter3R,
+        stride=1,
+        padding=0)
+    cov3 = img_conv_layer(
+        name=conv3,
+        input=cov3r,
+        filter_size=3,
+        num_filters=filter3,
+        stride=1,
+        padding=1)
+
+    cov5r = img_conv_layer(
+        name=conv5r,
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter5R,
+        stride=1,
+        padding=0)
+    cov5 = img_conv_layer(
+        name=conv5,
+        input=cov5r,
+        filter_size=5,
+        num_filters=filter5,
+        stride=1,
+        padding=2)
+
+    pool1 = img_pool_layer(
+        name=maxpool,
+        input=input,
+        pool_size=3,
+        num_channels=channels,
+        stride=1,
+        padding=1)
+    covprj = img_conv_layer(
+        name=convproj,
+        input=pool1,
+        filter_size=1,
+        num_filters=proj,
+        stride=1,
+        padding=0)
+
+    cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
+    return cat
+
+def inception(name, input, channels, \
+    filter1,
+    filter3R, filter3,
+    filter5R, filter5,
+    proj):
+
+    cov1 = conv_projection(
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter1,
+        stride=1,
+        padding=0)
+
+    cov3r = img_conv_layer(
+        name=name + '_3r',
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter3R,
+        stride=1,
+        padding=0)
+    cov3 = conv_projection(
+        input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
+
+    cov5r = img_conv_layer(
+        name=name + '_5r',
+        input=input,
+        filter_size=1,
+        num_channels=channels,
+        num_filters=filter5R,
+        stride=1,
+        padding=0)
+    cov5 = conv_projection(
+        input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
+
+    pool1 = img_pool_layer(
+        name=name + '_max',
+        input=input,
+        pool_size=3,
+        num_channels=channels,
+        stride=1,
+        padding=1)
+    covprj = conv_projection(
+        input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
+
+    cat = concat_layer(
+        name=name,
+        input=[cov1, cov3, cov5, covprj],
+        bias_attr=True,
+        act=ReluActivation())
+    return cat
+
+
+lab = data_layer(name="label", size=1000)
+data = data_layer(name="input", size=3 * height * width)
+
+# stage 1
+conv1 = img_conv_layer(
+    name="conv1",
+    input=data,
+    filter_size=7,
+    num_channels=3,
+    num_filters=64,
+    stride=2,
+    padding=3)
+pool1 = img_pool_layer(
+    name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
+
+# stage 2
+conv2_1 = img_conv_layer(
+    name="conv2_1",
+    input=pool1,
+    filter_size=1,
+    num_filters=64,
+    stride=1,
+    padding=0)
+conv2_2 = img_conv_layer(
+    name="conv2_2",
+    input=conv2_1,
+    filter_size=3,
+    num_filters=192,
+    stride=1,
+    padding=1)
+pool2 = img_pool_layer(
+    name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
+
+# stage 3
+ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
+ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
+pool3 = img_pool_layer(
+    name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
+
+# stage 4
+ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
+ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
+ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
+ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
+ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
+pool4 = img_pool_layer(
+    name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
+
+# stage 5
+ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
+ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
+pool5 = img_pool_layer(
+    name="pool5",
+    input=ince5b,
+    num_channels=1024,
+    pool_size=7,
+    stride=7,
+    pool_type=AvgPooling())
+
+# We remove loss1 and loss2 for all system when testing benchmark
+# output 1
+# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
+# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
+# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
+# out1 = fc_layer(name="output1", input=fc_o1,  size=1000, act=SoftmaxActivation())
+# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3) 
+
+# output 2
+#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
+#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
+#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
+#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
+#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3) 
+
+# output 3
+dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
+out3 = fc_layer(
+    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
+loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+
+outputs(loss3)
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac47212b5a75667e8e9d4465b33f575516e2836
--- /dev/null
+++ b/benchmark/paddle/image/provider.py
@@ -0,0 +1,26 @@
+import io, os
+import random
+import numpy as np
+from paddle.trainer.PyDataProvider2 import *
+
+
+def initHook(settings, height, width, color, num_class, **kwargs):
+    settings.height = height
+    settings.width = width
+    settings.color = color
+    settings.num_class = num_class
+    if settings.color:
+        settings.data_size = settings.height * settings.width * 3
+    else:
+        settings.data_size = settings.height * settings.width
+
+    settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+
+
+@provider(
+    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_list):
+    for i in xrange(1024):
+        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
+        lab = random.randint(0, settings.num_class)
+        yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..717ed487ba7657db6535efcb1128a355a0f15eaf
--- /dev/null
+++ b/benchmark/paddle/image/run.sh
@@ -0,0 +1,51 @@
+set -e
+
+function train() {
+  cfg=$1
+  thread=$2
+  bz=$3
+  args="batch_size=$3"
+  prefix=$4
+  paddle train --job=time \
+    --config=$cfg \
+    --use_gpu=True \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
+}
+
+if [ ! -d "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+#========single-gpu=========#
+# alexnet
+train alexnet.py 1 64 alexnet
+train alexnet.py 1 128 alexnet
+train alexnet.py 1 256 alexnet
+train alexnet.py 1 512 alexnet
+
+# googlenet
+train googlenet.py 1 64 googlenet
+train googlenet.py 1 128 googlenet
+train googlenet.py 1 256 googlenet
+
+# smallnet
+train smallnet_mnist_cifar.py 1 64 smallnet
+train smallnet_mnist_cifar.py 1 128 smallnet
+train smallnet_mnist_cifar.py 1 256 smallnet
+train smallnet_mnist_cifar.py 1 512 smallnet
+
+
+############################
+#========multi-gpus=========#
+train alexnet.py 4 512 alexnet
+train alexnet.py 4 1024 alexnet
+
+train googlenet.py 4 512 googlenet 
+train googlenet.py 4 1024 googlenet
diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..58879c454f37991405d83bbb593bb5d1e977ff53
--- /dev/null
+++ b/benchmark/paddle/image/smallnet_mnist_cifar.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+
+height = 32
+width = 32
+num_class = 10
+
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+# conv1
+net = data_layer('data', size=height * width * 3)
+net = img_conv_layer(
+    input=net,
+    filter_size=5,
+    num_channels=3,
+    num_filters=32,
+    stride=1,
+    padding=2)
+net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
+
+# conv2
+net = img_conv_layer(
+    input=net, filter_size=5, num_filters=32, stride=1, padding=2)
+net = img_pool_layer(
+    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
+
+# conv3
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=64, stride=1, padding=1)
+net = img_pool_layer(
+    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
+
+net = fc_layer(input=net, size=64, act=ReluActivation())
+net = fc_layer(input=net, size=10, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = classification_cost(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
new file mode 100755
index 0000000000000000000000000000000000000000..fc4ed4025f9ed2e0a32a1709ff8df4af53521196
--- /dev/null
+++ b/benchmark/paddle/rnn/imdb.py
@@ -0,0 +1,46 @@
+from __future__ import print_function
+import six.moves.cPickle as pickle
+import gzip
+import os
+import numpy
+
+
+def get_dataset_file(dataset, default_dataset, origin):
+    data_dir, data_file = os.path.split(dataset)
+    if (not os.path.isfile(dataset)) and data_file == default_dataset:
+        from six.moves import urllib
+        print('Downloading data from %s' % origin)
+        urllib.request.urlretrieve(origin, dataset)
+
+    return dataset
+
+
+def create_data(path="imdb.pkl"):
+
+    if (not os.path.isfile('imdb.train.pkl')):
+        path = get_dataset_file(
+            path, "imdb.pkl",
+            "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
+
+        if path.endswith(".gz"):
+            f = gzip.open(path, 'rb')
+        else:
+            f = open(path, 'rb')
+
+        train_set = pickle.load(f)
+        test_set = pickle.load(f)
+        f.close()
+
+        pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
+        pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
+
+    if (not os.path.isfile('train.list')):
+        file('train.list', 'w').write('imdb.train.pkl\n')
+
+
+def main():
+    create_data('imdb.pkl')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..928ca75daf84ccebb775364b0be0d8b3d5eebff9
--- /dev/null
+++ b/benchmark/paddle/rnn/provider.py
@@ -0,0 +1,72 @@
+import io, os
+import random
+import numpy as np
+import six.moves.cPickle as pickle
+from paddle.trainer.PyDataProvider2 import *
+
+
+def remove_unk(x, n_words):
+    return [[1 if w >= n_words else w for w in sen] for sen in x]
+
+
+# ==============================================================
+#  tensorflow uses fixed length, but PaddlePaddle can process
+#  variable-length. Padding is used in benchmark in order to
+#  compare with other platform. 
+# ==============================================================
+def pad_sequences(sequences,
+                  maxlen=None,
+                  dtype='int32',
+                  padding='post',
+                  truncating='post',
+                  value=0.):
+    lengths = [len(s) for s in sequences]
+
+    nb_samples = len(sequences)
+    if maxlen is None:
+        maxlen = np.max(lengths)
+
+    x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
+    for idx, s in enumerate(sequences):
+        if len(s) == 0:
+            continue  # empty list was found
+        if truncating == 'pre':
+            trunc = s[-maxlen:]
+        elif truncating == 'post':
+            trunc = s[:maxlen]
+        else:
+            raise ValueError("Truncating type '%s' not understood" % padding)
+
+        if padding == 'post':
+            x[idx, :len(trunc)] = trunc
+        elif padding == 'pre':
+            x[idx, -len(trunc):] = trunc
+        else:
+            raise ValueError("Padding type '%s' not understood" % padding)
+    return x
+
+
+def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
+    settings.vocab_size = vocab_size
+    settings.pad_seq = pad_seq
+    settings.maxlen = maxlen
+    settings.input_types = [
+        integer_value_sequence(vocab_size), integer_value(2)
+    ]
+
+
+@provider(
+    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file):
+    f = open(file, 'rb')
+    train_set = pickle.load(f)
+    f.close()
+    x, y = train_set
+
+    # remove unk, namely remove the words out of dictionary
+    x = remove_unk(x, settings.vocab_size)
+    if settings.pad_seq:
+        x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
+
+    for i in range(len(y)):
+        yield map(int, x[i]), int(y[i])
diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py
new file mode 100755
index 0000000000000000000000000000000000000000..83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c
--- /dev/null
+++ b/benchmark/paddle/rnn/rnn.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+import imdb
+
+num_class = 2
+vocab_size = 30000
+fixedlen = 100
+batch_size = get_config_arg('batch_size', int, 128)
+lstm_num = get_config_arg('lstm_num', int, 1)
+hidden_size = get_config_arg('hidden_size', int, 128)
+# whether to pad sequence into fixed length
+pad_seq = get_config_arg('pad_seq', bool, True)
+imdb.create_data('imdb.pkl')
+
+args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25)
+
+net = data_layer('data', size=vocab_size)
+net = embedding_layer(input=net, size=128)
+
+for i in xrange(lstm_num):
+    net = simple_lstm(input=net, size=hidden_size)
+
+net = last_seq(input=net)
+net = fc_layer(input=net, size=2, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = classification_cost(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9dfeb2e525979f47e4ef48f7610dc1007900f2c
--- /dev/null
+++ b/benchmark/paddle/rnn/run.sh
@@ -0,0 +1,50 @@
+set -e
+
+function train() {
+  cfg=$1
+  thread=$2
+  args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
+  paddle train --job=time \
+    --config=$cfg \
+    --use_gpu=1 \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --num_passes=1 \
+    --feed_data=1 \
+    --config_args=$args \
+    >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+## padding, single gpu
+#-----config--gpu--lstm_num--padding--hidden_size--batch_size
+## lstm_num=2, batch_size=64
+train rnn.py 1 2 1 256 64 
+train rnn.py 1 2 1 512 64 
+train rnn.py 1 2 1 1280 64 
+
+## lstm_num=2, batch_size=128
+train rnn.py 1 2 1 256 128 
+train rnn.py 1 2 1 512 128 
+train rnn.py 1 2 1 1280 128 
+
+## lstm_num=4, batch_size=256
+train rnn.py 1 2 1 256 256 
+train rnn.py 1 2 1 512 256 
+train rnn.py 1 2 1 1280 256 
+
+
+#==================multi gpus=====================#
+# hidden_size=256, lstm_num=2, different batch size
+train rnn.py 4 2 1 256 128 
+train rnn.py 4 2 1 256 256 
+train rnn.py 4 2 1 256 512 
+
+# hidden_size=512, lstm_num=4, different batch size
+train rnn.py 4 2 1 512 128 
+train rnn.py 4 2 1 512 256 
+train rnn.py 4 2 1 512 512 
diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6a39ef778e21bee7374718a1b1ddf43392825a8
--- /dev/null
+++ b/benchmark/tensorflow/image/alexnet.py
@@ -0,0 +1,298 @@
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+                            """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+                            """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+                           """The data format for Convnet operations.
+                           Can be either NHWC or NCHW.
+                           """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [kH, kW, nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None and wd > 0:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        if FLAGS.data_format == 'NCHW':
+            strides = [1, 1, dH, dW]
+        else:
+            strides = [1, dH, dW, 1]
+        conv = tf.nn.conv2d(
+            inpOp,
+            kernel,
+            strides,
+            padding=padType,
+            data_format=FLAGS.data_format)
+
+        biases = tf.get_variable(
+            name=name + '_b',
+            shape=[nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32)
+
+        bias = tf.reshape(
+            tf.nn.bias_add(
+                conv, biases, data_format=FLAGS.data_format),
+            conv.get_shape())
+
+        conv1 = tf.nn.relu(bias, name=scope)
+        return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None and wd > 0:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)
+
+        affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+                  tf.matmul(inpOp, kernel) + biases
+
+        output = tf.nn.dropout(affine1, drop) if drop else affine1
+
+        return output
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW):
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.max_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding='VALID',
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _norm(name, l_input, lsize=4):
+    return tf.nn.lrn(l_input,
+                     lsize,
+                     bias=1.0,
+                     alpha=0.001 / 9.0,
+                     beta=0.75,
+                     name=name)
+
+
+def loss(logits, labels):
+    labels = tf.cast(labels, tf.int64)
+    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        logits, labels, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+
+    # The total loss is defined as the cross entropy loss plus all of the weight
+    # decay terms (L2 loss).
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+    conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
+    pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
+    norm1 = _norm('norm1', pool1, lsize=5)
+    conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
+    pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
+    norm2 = _norm('norm2', pool2, lsize=5)
+    conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
+    conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
+    conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
+    pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
+    resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
+    affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5)
+    affn2 = _affine('fc7', affn1, 4096, 4096, 0.5)
+    affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False)  # last fc
+
+    return affn3
+
+
+def time_tensorflow_run(session, target, info_string):
+    num_steps_burn_in = 10
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    if not isinstance(target, list):
+        target = [target]
+    target_op = tf.group(*target)
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _ = session.run(target_op)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                print('%s: step %d, duration = %.3f' %
+                      (datetime.now(), i - num_steps_burn_in, duration))
+            total_duration += duration
+            total_duration_squared += duration * duration
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def _add_loss_summaries(total_loss):
+    """
+  Generates moving average for all losses and associated summaries for
+  visualizing the performance of the network.
+
+  Args:
+    total_loss: Total loss from loss().
+  Returns:
+    loss_averages_op: op for generating moving averages of losses.
+  """
+    # Compute the moving average of all individual losses and the total loss.
+    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+    losses = tf.get_collection('losses')
+    loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Name each loss as '(raw)' and name the moving average version of the loss
+        # as the original loss name.
+        tf.scalar_summary(l.op.name + ' (raw)', l)
+        tf.scalar_summary(l.op.name, loss_averages.average(l))
+
+    return loss_averages_op
+
+
+def run_benchmark():
+    with tf.Graph().as_default():
+        with tf.device('/gpu:0'):
+            # Generate some dummy images.
+            image_size = 224
+            # Note that our padding definition is slightly different the cuda-convnet.
+            # In order to force the model to start with the same activations sizes,
+            # we add 3 to the image_size and employ VALID padding above.
+            if FLAGS.data_format == 'NCHW':
+                image_shape = [
+                    FLAGS.batch_size, 3, image_size + 3, image_size + 3
+                ]
+            else:
+                image_shape = [
+                    FLAGS.batch_size, image_size + 3, image_size + 3, 3
+                ]
+            images = tf.get_variable(
+                'image',
+                image_shape,
+                initializer=tf.truncated_normal_initializer(
+                    stddev=0.1, dtype=tf.float32),
+                dtype=tf.float32,
+                trainable=False)
+
+            labels = tf.get_variable(
+                'label', [FLAGS.batch_size],
+                initializer=tf.constant_initializer(1),
+                dtype=tf.int32,
+                trainable=False)
+
+            # Build a Graph that computes the logits predictions from the
+            # inference model.
+            last_layer = inference(images)
+
+            objective = loss(last_layer, labels)
+            # Compute the gradient with respect to all the parameters.
+
+            # Compute gradients.
+            # opt = tf.train.GradientDescentOptimizer(0.001)
+            opt = tf.train.MomentumOptimizer(0.001, 0.9)
+            grads = opt.compute_gradients(objective)
+            global_step = tf.get_variable(
+                'global_step', [],
+                initializer=tf.constant_initializer(
+                    0.0, dtype=tf.float32),
+                trainable=False,
+                dtype=tf.float32)
+            apply_gradient_op = opt.apply_gradients(
+                grads, global_step=global_step)
+
+            # Track the moving averages of all trainable variables.
+            variable_averages = tf.train.ExponentialMovingAverage(0.9,
+                                                                  global_step)
+            variables_averages_op = variable_averages.apply(
+                tf.trainable_variables())
+
+            # Build an initialization operation.
+            init = tf.initialize_all_variables()
+
+            # Start running operations on the Graph.
+            sess = tf.Session(config=tf.ConfigProto(
+                allow_soft_placement=True,
+                log_device_placement=FLAGS.log_device_placement))
+            sess.run(init)
+
+            run_forward = True
+            run_forward_backward = True
+            if FLAGS.forward_only and FLAGS.forward_backward_only:
+                raise ValueError("Cannot specify --forward_only and "
+                                 "--forward_backward_only at the same time.")
+            if FLAGS.forward_only:
+                run_forward_backward = False
+            elif FLAGS.forward_backward_only:
+                run_forward = False
+
+            if run_forward:
+                time_tensorflow_run(sess, last_layer, "Forward")
+
+            if run_forward_backward:
+                with tf.control_dependencies(
+                    [apply_gradient_op, variables_averages_op]):
+                    train_op = tf.no_op(name='train')
+                time_tensorflow_run(sess, [train_op, objective],
+                                    "Forward-backward")
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b5ee78f4dd5429abd85d75c092a6e3a2a39f922
--- /dev/null
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
@@ -0,0 +1,365 @@
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import re
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+                           """The data format for Convnet operations.
+                           Can be either NHWC or NCHW.
+                           """)
+
+tf.app.flags.DEFINE_string('train_dir', '/train_model',
+                           """Directory where to write event logs """
+                           """and checkpoint.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [kH, kW, nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        if FLAGS.data_format == 'NCHW':
+            strides = [1, 1, dH, dW]
+        else:
+            strides = [1, dH, dW, 1]
+        conv = tf.nn.conv2d(
+            inpOp,
+            kernel,
+            strides,
+            padding=padType,
+            data_format=FLAGS.data_format)
+
+        biases = tf.get_variable(
+            name=name + '_b',
+            shape=[nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32)
+
+        bias = tf.reshape(
+            tf.nn.bias_add(
+                conv, biases, data_format=FLAGS.data_format),
+            conv.get_shape())
+
+        conv1 = tf.nn.relu(bias, name=scope)
+        return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)
+
+        affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+                  tf.matmul(inpOp, kernel) + biases
+
+        return affine1
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW):
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.max_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding='VALID',
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _norm(name, l_input, lsize=4):
+    return tf.nn.lrn(l_input,
+                     lsize,
+                     bias=1.0,
+                     alpha=0.001 / 9.0,
+                     beta=0.75,
+                     name=name)
+
+
+def loss(logits, labels):
+    labels = tf.cast(labels, tf.int64)
+    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        logits, labels, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+
+    # The total loss is defined as the cross entropy loss plus all of the weight
+    # decay terms (L2 loss).
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+    conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
+    pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
+    norm1 = _norm('norm1', pool1, lsize=5)
+    conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
+    pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
+    norm2 = _norm('norm2', pool2, lsize=5)
+    conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
+    conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
+    conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
+    pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
+    resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
+    affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
+    affn2 = _affine('fc7', affn1, 4096, 4096)
+    affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False)  # last fc
+
+    return affn3
+
+
+def tower_loss(scope):
+    """Calculate the total loss on a single tower running the model.
+    Args:
+        scope: unique prefix string identifying the tower, e.g. 'tower_0'
+    Returns:
+        Tensor of shape [] containing the total loss for a batch of data
+    """
+    image_size = 224
+    if FLAGS.data_format == 'NCHW':
+        image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
+    else:
+        image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
+    images = tf.get_variable(
+        'image',
+        image_shape,
+        initializer=tf.truncated_normal_initializer(
+            stddev=0.1, dtype=tf.float32),
+        dtype=tf.float32,
+        trainable=False)
+
+    labels = tf.get_variable(
+        'label', [FLAGS.batch_size],
+        initializer=tf.constant_initializer(1),
+        dtype=tf.int32,
+        trainable=False)
+
+    # Build a Graph that computes the logits predictions from the
+    # inference model.
+    last_layer = inference(images)
+
+    # Build the portion of the Graph calculating the losses. Note that we will
+    # assemble the total_loss using a custom function below.
+    _ = loss(last_layer, labels)
+
+    # Assemble all of the losses for the current tower only.
+    losses = tf.get_collection('losses', scope)
+
+    # Calculate the total loss for the current tower.
+    total_loss = tf.add_n(losses, name='total_loss')
+
+    # Compute the moving average of all individual losses and the total loss.
+    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+    loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+        # session. This helps the clarity of presentation on tensorboard.
+        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+        # Name each loss as '(raw)' and name the moving average version of the loss
+        # as the original loss name.
+        tf.scalar_summary(loss_name + ' (raw)', l)
+        tf.scalar_summary(loss_name, loss_averages.average(l))
+
+    with tf.control_dependencies([loss_averages_op]):
+        total_loss = tf.identity(total_loss)
+    return total_loss
+
+
+def average_gradients(tower_grads):
+    """Calculate the average gradient for each shared variable across all towers.
+  Note that this function provides a synchronization point across all towers.
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over individual gradients. The inner list is over the gradient
+      calculation for each tower.
+  Returns:
+     List of pairs of (gradient, variable) where the gradient has been averaged
+     across all towers.
+  """
+    average_grads = []
+    for grad_and_vars in zip(*tower_grads):
+        # Note that each grad_and_vars looks like the following:
+        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+        grads = []
+        for g, _ in grad_and_vars:
+            # Add 0 dimension to the gradients to represent the tower.
+            expanded_g = tf.expand_dims(g, 0)
+
+            # Append on a 'tower' dimension which we will average over below.
+            grads.append(expanded_g)
+
+        # Average over the 'tower' dimension.
+        grad = tf.concat(0, grads)
+        grad = tf.reduce_mean(grad, 0)
+
+        # Keep in mind that the Variables are redundant because they are shared
+        # across towers. So .. we will just return the first tower's pointer to
+        # the Variable.
+        v = grad_and_vars[0][1]
+        grad_and_var = (grad, v)
+        average_grads.append(grad_and_var)
+    return average_grads
+
+
+def time_tensorflow_run(session, target):
+    num_steps_burn_in = 50
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _, loss_value = session.run(target)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+                examples_per_sec = num_examples_per_step / duration
+                sec_per_batch = duration
+
+                format_str = (
+                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+                    'sec/batch batch_size = %d)')
+                print(format_str %
+                      (datetime.now(), i - num_steps_burn_in, loss_value,
+                       duration, sec_per_batch, num_examples_per_step))
+
+            total_duration += duration
+            total_duration_squared += duration * duration
+
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    with tf.Graph().as_default(), tf.device('/cpu:0'):
+        # Create a variable to count the number of train() calls. This equals the
+        # number of batches processed * FLAGS.num_gpus.
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(0),
+            trainable=False)
+
+        # Calculate the learning rate schedule.
+        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+                                 FLAGS.batch_size)
+        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+        # Decay the learning rate exponentially based on the number of steps.
+        lr = tf.train.exponential_decay(
+            INITIAL_LEARNING_RATE,
+            global_step,
+            decay_steps,
+            LEARNING_RATE_DECAY_FACTOR,
+            staircase=True)
+
+        # Create an optimizer that performs gradient descent.
+        opt = tf.train.MomentumOptimizer(lr, 0.9)
+
+        # Calculate the gradients for each model tower.
+        tower_grads = []
+        for i in xrange(FLAGS.num_gpus):
+            with tf.device('/gpu:%d' % i):
+                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+                    # Calculate the loss for one tower of the model. This function
+                    # constructs the entire model but shares the variables across
+                    # all towers.
+                    loss = tower_loss(scope)
+
+                    # Reuse variables for the next tower.
+                    tf.get_variable_scope().reuse_variables()
+
+                    # Retain the summaries from the final tower.
+                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+                    # Calculate the gradients for the batch of data on this tower.
+                    grads = opt.compute_gradients(loss)
+
+                    # Keep track of the gradients across all towers.
+                    tower_grads.append(grads)
+
+        # We must calculate the mean of each gradient. Note that this is the
+        # synchronization point across all towers.
+        grads = average_gradients(tower_grads)
+
+        # Apply the gradients to adjust the shared variables.
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Group all updates to into a single train op.
+        train_op = tf.group(apply_gradient_op)
+
+        # Build an initialization operation.
+        init = tf.initialize_all_variables()
+
+        # Start running operations on the Graph. allow_soft_placement must be set to
+        # True to build towers on GPU, as some of the ops do not have GPU
+        # implementations.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+        time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..decf855b54451efba5f6a7868fbcf631789f3572
--- /dev/null
+++ b/benchmark/tensorflow/image/googlenet.py
@@ -0,0 +1,311 @@
+from six.moves import xrange
+from datetime import datetime
+import math
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+                            """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+                            """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+                           """The data format for Convnet operations.
+                           Can be either NHWC or NCHW.
+                           """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+parameters = []
+
+conv_counter = 1
+pool_counter = 1
+affine_counter = 1
+
+
+def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
+    global conv_counter
+    global parameters
+    name = 'conv' + str(conv_counter)
+    conv_counter += 1
+    with tf.name_scope(name) as scope:
+        kernel = tf.Variable(
+            tf.truncated_normal(
+                [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
+            name='weights')
+
+        if wd is not None and wd > 0:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        if FLAGS.data_format == 'NCHW':
+            strides = [1, 1, dH, dW]
+        else:
+            strides = [1, dH, dW, 1]
+        conv = tf.nn.conv2d(
+            inpOp,
+            kernel,
+            strides,
+            padding=padType,
+            data_format=FLAGS.data_format)
+        biases = tf.Variable(
+            tf.constant(
+                0.0, shape=[nOut], dtype=tf.float32),
+            trainable=True,
+            name='biases')
+        bias = tf.reshape(
+            tf.nn.bias_add(
+                conv, biases, data_format=FLAGS.data_format),
+            conv.get_shape())
+        conv1 = tf.nn.relu(bias, name=scope)
+        parameters += [kernel, biases]
+        return conv1
+
+
+def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
+    global affine_counter
+    global parameters
+    name = 'affine' + str(affine_counter)
+    affine_counter += 1
+    with tf.name_scope(name) as scope:
+        kernel = tf.Variable(
+            tf.truncated_normal(
+                [nIn, nOut], dtype=tf.float32, stddev=1e-1),
+            name='weights')
+
+        if wd is not None and wd > 0:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        biases = tf.Variable(
+            tf.constant(
+                0.0, shape=[nOut], dtype=tf.float32),
+            trainable=True,
+            name='biases')
+        affine1 = tf.nn.relu_layer(
+            inpOp, kernel, biases,
+            name=name) if act else tf.matmul(inpOp, kernel) + biases
+        parameters += [kernel, biases]
+        return affine1
+
+
+def _mpool(inpOp, kH, kW, dH, dW, padding):
+    global pool_counter
+    global parameters
+    name = 'pool' + str(pool_counter)
+    pool_counter += 1
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.max_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _apool(inpOp, kH, kW, dH, dW, padding):
+    global pool_counter
+    global parameters
+    name = 'pool' + str(pool_counter)
+    pool_counter += 1
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.avg_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
+    conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
+
+    conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
+    conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
+
+    conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
+    conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
+
+    pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
+    pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
+
+    if FLAGS.data_format == 'NCHW':
+        channel_dim = 1
+    else:
+        channel_dim = 3
+    incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
+    return incept
+
+
+def loss(logits, labels):
+    batch_size = tf.size(labels)
+    labels = tf.expand_dims(labels, 1)
+    indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
+    concated = tf.concat(1, [indices, labels])
+    onehot_labels = tf.sparse_to_dense(concated,
+                                       tf.pack([batch_size, 1000]), 1.0, 0.0)
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits, onehot_labels, name='xentropy')
+    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+    return loss
+
+
+def inference(images):
+    # stage 1
+    conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
+    pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
+    # stage 2
+    conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
+    conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
+    pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
+
+    # stage 3
+    incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
+    incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
+    pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
+
+    # stage 4
+    incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
+    incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
+    incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
+    incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
+    incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
+    pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
+
+    # stage 5
+    incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
+    incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
+    pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
+
+    # output 1
+    resh1 = tf.reshape(pool6, [-1, 1024])
+    drop = tf.nn.dropout(resh1, 0.4)
+    affn1 = _affine(resh1, 1024, 1000, act=False)
+
+    return affn1
+
+
+def time_tensorflow_run(session, target, info_string):
+    num_steps_burn_in = 10
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    if not isinstance(target, list):
+        target = [target]
+    target_op = tf.group(*target)
+    for i in range(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _ = session.run(target_op)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                print('%s: step %d, duration = %.3f' %
+                      (datetime.now(), i - num_steps_burn_in, duration))
+            total_duration += duration
+            total_duration_squared += duration * duration
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    global parameters
+    with tf.Graph().as_default():
+        # Generate some dummy images.
+        image_size = 224
+        if FLAGS.data_format == 'NCHW':
+            image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+        else:
+            image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+
+        images = tf.get_variable(
+            'image',
+            image_shape,
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.1, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=False)
+
+        labels = tf.get_variable(
+            'label', [FLAGS.batch_size],
+            initializer=tf.constant_initializer(1),
+            dtype=tf.int32,
+            trainable=False)
+
+        # Build a Graph that computes the logits predictions from the
+        # inference model.
+        last_layer = inference(images)
+
+        objective = loss(last_layer, labels)
+
+        # Compute gradients.
+        # opt = tf.train.GradientDescentOptimizer(0.001)
+        opt = tf.train.MomentumOptimizer(0.001, 0.9)
+        grads = opt.compute_gradients(objective)
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(
+                0.0, dtype=tf.float32),
+            trainable=False,
+            dtype=tf.float32)
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Track the moving averages of all trainable variables.
+        variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
+        variables_averages_op = variable_averages.apply(tf.trainable_variables(
+        ))
+
+        # Build an initialization operation.
+        init = tf.initialize_all_variables()
+
+        # Start running operations on the Graph.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+
+        run_forward = True
+        run_forward_backward = True
+        if FLAGS.forward_only and FLAGS.forward_backward_only:
+            raise ValueError("Cannot specify --forward_only and "
+                             "--forward_backward_only at the same time.")
+        if FLAGS.forward_only:
+            run_forward_backward = False
+        elif FLAGS.forward_backward_only:
+            run_forward = False
+
+        if run_forward:
+            # Run the forward benchmark.
+            time_tensorflow_run(sess, last_layer, "Forward")
+
+        if run_forward_backward:
+            with tf.control_dependencies(
+                [apply_gradient_op, variables_averages_op]):
+                train_op = tf.no_op(name='train')
+            time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..31466faa37c47c66e4fe4628e28c867875e89f2e
--- /dev/null
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -0,0 +1,411 @@
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import re
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+                           """The data format for Convnet operations.
+                           Can be either NHWC or NCHW.
+                           """)
+
+tf.app.flags.DEFINE_string('train_dir', '/train_model',
+                           """Directory where to write event logs """
+                           """and checkpoint.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [kH, kW, nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        if FLAGS.data_format == 'NCHW':
+            strides = [1, 1, dH, dW]
+        else:
+            strides = [1, dH, dW, 1]
+        conv = tf.nn.conv2d(
+            inpOp,
+            kernel,
+            strides,
+            padding=padType,
+            data_format=FLAGS.data_format)
+
+        biases = tf.get_variable(
+            name=name + '_b',
+            shape=[nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32)
+
+        bias = tf.reshape(
+            tf.nn.bias_add(
+                conv, biases, data_format=FLAGS.data_format),
+            conv.get_shape())
+
+        conv1 = tf.nn.relu(bias, name=scope)
+        return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)
+
+        affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+                  tf.matmul(inpOp, kernel) + biases
+
+        return affine1
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW, padding):
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.max_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _apool(name, inpOp, kH, kW, dH, dW, padding):
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.avg_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def loss(logits, labels):
+    labels = tf.cast(labels, tf.int64)
+    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        logits, labels, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+
+    # The total loss is defined as the cross entropy loss plus all of the weight
+    # decay terms (L2 loss).
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
+    conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
+
+    conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
+    conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
+
+    conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
+    conv5 = _conv(name + '5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
+
+    pool_ = _mpool(name + 'pool', inp, o4s1, o4s1, 1, 1, 'SAME')
+    pool = _conv(name + 'proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
+
+    if FLAGS.data_format == 'NCHW':
+        channel_dim = 1
+    else:
+        channel_dim = 3
+    incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
+    return incept
+
+
+def inference(images):
+    # stage 1
+    conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME')
+    pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME')
+
+    # stage 2
+    conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID')
+    conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME')
+    pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME')
+
+    # stage 3
+    incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32)
+    incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
+    pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME')
+
+    # stage 4
+    incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64)
+    incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
+    incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
+    incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
+    incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3,
+                          128)
+    pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME')
+
+    # stage 5
+    incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128)
+    incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3,
+                          128)
+    pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID')
+
+    # output 1
+    resh1 = tf.reshape(pool6, [-1, 1024])
+    drop = tf.nn.dropout(resh1, 0.4)
+    affn1 = _affine('fc_out', resh1, 1024, 1000, act=False)
+
+    return affn1
+
+
+def tower_loss(scope):
+    """Calculate the total loss on a single tower running the model.
+    Args:
+        scope: unique prefix string identifying the tower, e.g. 'tower_0'
+    Returns:
+        Tensor of shape [] containing the total loss for a batch of data
+    """
+    image_size = 224
+    if FLAGS.data_format == 'NCHW':
+        image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+    else:
+        image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+    images = tf.get_variable(
+        'image',
+        image_shape,
+        initializer=tf.truncated_normal_initializer(
+            stddev=0.1, dtype=tf.float32),
+        dtype=tf.float32,
+        trainable=False)
+
+    labels = tf.get_variable(
+        'label', [FLAGS.batch_size],
+        initializer=tf.constant_initializer(1),
+        dtype=tf.int32,
+        trainable=False)
+
+    # Build a Graph that computes the logits predictions from the
+    # inference model.
+    last_layer = inference(images)
+
+    # Build the portion of the Graph calculating the losses. Note that we will
+    # assemble the total_loss using a custom function below.
+    _ = loss(last_layer, labels)
+
+    # Assemble all of the losses for the current tower only.
+    losses = tf.get_collection('losses', scope)
+
+    # Calculate the total loss for the current tower.
+    total_loss = tf.add_n(losses, name='total_loss')
+
+    # Compute the moving average of all individual losses and the total loss.
+    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+    loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+        # session. This helps the clarity of presentation on tensorboard.
+        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+        # Name each loss as '(raw)' and name the moving average version of the loss
+        # as the original loss name.
+        tf.scalar_summary(loss_name + ' (raw)', l)
+        tf.scalar_summary(loss_name, loss_averages.average(l))
+
+    with tf.control_dependencies([loss_averages_op]):
+        total_loss = tf.identity(total_loss)
+    return total_loss
+
+
+def average_gradients(tower_grads):
+    """Calculate the average gradient for each shared variable across all towers.
+  Note that this function provides a synchronization point across all towers.
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over individual gradients. The inner list is over the gradient
+      calculation for each tower.
+  Returns:
+     List of pairs of (gradient, variable) where the gradient has been averaged
+     across all towers.
+  """
+    average_grads = []
+    for grad_and_vars in zip(*tower_grads):
+        # Note that each grad_and_vars looks like the following:
+        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+        grads = []
+        for g, _ in grad_and_vars:
+            # Add 0 dimension to the gradients to represent the tower.
+            expanded_g = tf.expand_dims(g, 0)
+
+            # Append on a 'tower' dimension which we will average over below.
+            grads.append(expanded_g)
+
+        # Average over the 'tower' dimension.
+        grad = tf.concat(0, grads)
+        grad = tf.reduce_mean(grad, 0)
+
+        # Keep in mind that the Variables are redundant because they are shared
+        # across towers. So .. we will just return the first tower's pointer to
+        # the Variable.
+        v = grad_and_vars[0][1]
+        grad_and_var = (grad, v)
+        average_grads.append(grad_and_var)
+    return average_grads
+
+
+def time_tensorflow_run(session, target):
+    num_steps_burn_in = 50
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _, loss_value = session.run(target)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+                examples_per_sec = num_examples_per_step / duration
+                sec_per_batch = duration
+
+                format_str = (
+                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+                    'sec/batch batch_size = %d)')
+                print(format_str %
+                      (datetime.now(), i - num_steps_burn_in, loss_value,
+                       duration, sec_per_batch, num_examples_per_step))
+
+            total_duration += duration
+            total_duration_squared += duration * duration
+
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    with tf.Graph().as_default(), tf.device('/cpu:0'):
+        # Create a variable to count the number of train() calls. This equals the
+        # number of batches processed * FLAGS.num_gpus.
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(0),
+            trainable=False)
+
+        # Calculate the learning rate schedule.
+        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+                                 FLAGS.batch_size)
+        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+        # Decay the learning rate exponentially based on the number of steps.
+        lr = tf.train.exponential_decay(
+            INITIAL_LEARNING_RATE,
+            global_step,
+            decay_steps,
+            LEARNING_RATE_DECAY_FACTOR,
+            staircase=True)
+
+        # Create an optimizer that performs gradient descent.
+        opt = tf.train.MomentumOptimizer(lr, 0.9)
+
+        # Calculate the gradients for each model tower.
+        tower_grads = []
+        for i in xrange(FLAGS.num_gpus):
+            with tf.device('/gpu:%d' % i):
+                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+                    # Calculate the loss for one tower of the model. This function
+                    # constructs the entire model but shares the variables across
+                    # all towers.
+                    loss = tower_loss(scope)
+
+                    # Reuse variables for the next tower.
+                    tf.get_variable_scope().reuse_variables()
+
+                    # Retain the summaries from the final tower.
+                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+                    # Calculate the gradients for the batch of data on this tower.
+                    grads = opt.compute_gradients(loss)
+
+                    # Keep track of the gradients across all towers.
+                    tower_grads.append(grads)
+
+        # We must calculate the mean of each gradient. Note that this is the
+        # synchronization point across all towers.
+        grads = average_gradients(tower_grads)
+
+        # Apply the gradients to adjust the shared variables.
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Group all updates to into a single train op.
+        train_op = tf.group(apply_gradient_op)
+
+        # Build an initialization operation.
+        init = tf.initialize_all_variables()
+
+        # Start running operations on the Graph. allow_soft_placement must be set to
+        # True to build towers on GPU, as some of the ops do not have GPU
+        # implementations.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+        time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eade36beb9df5f8d3978939216e058203e024c1a
--- /dev/null
+++ b/benchmark/tensorflow/image/run.sh
@@ -0,0 +1,28 @@
+set -e
+
+function test() {
+  cfg=$1
+  batch_size=$2
+  prefix=$3
+  python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet.py 64 alexnet
+test alexnet.py 128 alexnet
+test alexnet.py 256 alexnet
+test alexnet.py 512 alexnet
+
+# googlenet
+test googlenet.py 64 googlenet
+test googlenet.py 128 googlenet
+
+# smallnet 
+test smallnet_mnist_cifar.py 64 smallnet
+test smallnet_mnist_cifar.py 128 smallnet
+test smallnet_mnist_cifar.py 256 smallnet
+test smallnet_mnist_cifar.py 512 smallnet
diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..69faa4331744f2276e7706185ae10bc507f95764
--- /dev/null
+++ b/benchmark/tensorflow/image/run_multi.sh
@@ -0,0 +1,22 @@
+set -e
+
+function test() {
+  cfg=$1
+  num_gpu=$2
+  batch_size=$3
+  batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
+  prefix=$4
+  python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet_multi_gpu.py 4 512 alexnet
+test alexnet_multi_gpu.py 4 1024 alexnet
+
+# googlenet 
+test googlenet_multi_gpu.py 4 512 alexnet
+test googlenet_multi_gpu.py 4 1024 alexnet
diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a625134a6c58586b29190ede9c66253f484d2cf
--- /dev/null
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
@@ -0,0 +1,304 @@
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+                            """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+                            """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+                           """The data format for Convnet operations.
+                           Can be either NHWC or NCHW.
+                           """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+parameters = []
+
+conv_counter = 1
+pool_counter = 1
+affine_counter = 1
+
+
+def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
+    global conv_counter
+    global parameters
+    name = 'conv' + str(conv_counter)
+    conv_counter += 1
+    with tf.name_scope(name) as scope:
+        kernel = tf.Variable(
+            tf.truncated_normal(
+                [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
+            name='weights')
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        if FLAGS.data_format == 'NCHW':
+            strides = [1, 1, dH, dW]
+        else:
+            strides = [1, dH, dW, 1]
+        conv = tf.nn.conv2d(
+            inpOp,
+            kernel,
+            strides,
+            padding=padType,
+            data_format=FLAGS.data_format)
+        biases = tf.Variable(
+            tf.constant(
+                0.0, shape=[nOut], dtype=tf.float32),
+            trainable=True,
+            name='biases')
+        bias = tf.reshape(
+            tf.nn.bias_add(
+                conv, biases, data_format=FLAGS.data_format),
+            conv.get_shape())
+
+        conv1 = tf.nn.relu(bias, name=scope) if act else bias
+
+        parameters += [kernel, biases]
+
+        return conv1
+
+
+def _affine(inpOp, nIn, nOut, wd=None, act=True):
+    global affine_counter
+    global parameters
+    name = 'affine' + str(affine_counter)
+    affine_counter += 1
+    with tf.name_scope(name) as scope:
+        kernel = tf.Variable(
+            tf.truncated_normal(
+                [nIn, nOut], dtype=tf.float32, stddev=1e-1),
+            name='weights')
+
+        if wd is not None:
+            weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+            tf.add_to_collection('losses', weight_decay)
+
+        biases = tf.Variable(
+            tf.constant(
+                0.0, shape=[nOut], dtype=tf.float32),
+            trainable=True,
+            name='biases')
+
+        affine1 = tf.nn.relu_layer(
+            inpOp, kernel, biases,
+            name=name) if act else tf.matmul(inpOp, kernel) + biases
+
+        parameters += [kernel, biases]
+
+        return affine1
+
+
+def _mpool(inpOp, kH, kW, dH, dW, padding):
+    global pool_counter
+    global parameters
+    name = 'pool' + str(pool_counter)
+    pool_counter += 1
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.max_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _apool(inpOp, kH, kW, dH, dW, padding):
+    global pool_counter
+    global parameters
+    name = 'pool' + str(pool_counter)
+    pool_counter += 1
+    if FLAGS.data_format == 'NCHW':
+        ksize = [1, 1, kH, kW]
+        strides = [1, 1, dH, dW]
+    else:
+        ksize = [1, kH, kW, 1]
+        strides = [1, dH, dW, 1]
+    return tf.nn.avg_pool(
+        inpOp,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=FLAGS.data_format,
+        name=name)
+
+
+def _norm(name, l_input, lsize=4):
+    return tf.nn.lrn(l_input,
+                     lsize,
+                     bias=1.0,
+                     alpha=0.001 / 9.0,
+                     beta=0.75,
+                     name=name)
+
+
+def loss(logits, labels):
+    batch_size = tf.size(labels)
+    labels = tf.expand_dims(labels, 1)
+    indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
+    concated = tf.concat(1, [indices, labels])
+    onehot_labels = tf.sparse_to_dense(concated,
+                                       tf.pack([batch_size, 10]), 1.0, 0.0)
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits, onehot_labels, name='xentropy')
+    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+    return loss
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+    conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME')
+    pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
+    conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME')
+    pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME')
+    conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME')
+    pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME')
+    resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4])
+    affn1 = _affine(resh1, 64 * 4 * 4, 64)
+    affn2 = _affine(affn1, 64, 10, act=False)
+
+    print('conv1:', get_incoming_shape(conv1))
+    print('pool1:', get_incoming_shape(pool1))
+    print('conv2:', get_incoming_shape(conv2))
+    print('pool2:', get_incoming_shape(pool2))
+    print('conv3:', get_incoming_shape(conv3))
+    print('pool3:', get_incoming_shape(pool3))
+
+    return affn2
+
+
+def time_tensorflow_run(session, target, info_string):
+    num_steps_burn_in = 10
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    if not isinstance(target, list):
+        target = [target]
+    target_op = tf.group(*target)
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _ = session.run(target_op)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                print('%s: step %d, duration = %.3f' %
+                      (datetime.now(), i - num_steps_burn_in, duration))
+            total_duration += duration
+            total_duration_squared += duration * duration
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    global parameters
+    with tf.Graph().as_default():
+        # Generate some dummy images.
+        image_size = 32
+        # Note that our padding definition is slightly different the cuda-convnet.
+        # In order to force the model to start with the same activations sizes,
+        # we add 3 to the image_size and employ VALID padding above.
+        if FLAGS.data_format == 'NCHW':
+            image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+        else:
+            image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+
+        images = tf.get_variable(
+            'image',
+            image_shape,
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.1, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=False)
+
+        labels = tf.get_variable(
+            'label', [FLAGS.batch_size],
+            initializer=tf.constant_initializer(1),
+            dtype=tf.int32,
+            trainable=False)
+
+        # Build a Graph that computes the logits predictions from the
+        # inference model.
+        last_layer = inference(images)
+
+        objective = loss(last_layer, labels)
+
+        # Compute gradients.
+        opt = tf.train.MomentumOptimizer(0.001, 0.9)
+        grads = opt.compute_gradients(objective)
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(
+                0.0, dtype=tf.float32),
+            trainable=False,
+            dtype=tf.float32)
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Track the moving averages of all trainable variables.
+        variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
+        variables_averages_op = variable_averages.apply(tf.trainable_variables(
+        ))
+
+        # Build an initialization operation.
+        init = tf.initialize_all_variables()
+
+        # Start running operations on the Graph.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+
+        run_forward = True
+        run_forward_backward = True
+        if FLAGS.forward_only and FLAGS.forward_backward_only:
+            raise ValueError("Cannot specify --forward_only and "
+                             "--forward_backward_only at the same time.")
+        if FLAGS.forward_only:
+            run_forward_backward = False
+        elif FLAGS.forward_backward_only:
+            run_forward = False
+
+        if run_forward:
+            # Run the forward benchmark.
+            time_tensorflow_run(sess, last_layer, "Forward")
+
+        if run_forward_backward:
+            with tf.control_dependencies(
+                [apply_gradient_op, variables_averages_op]):
+                train_op = tf.no_op(name='train')
+            time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/rnn/README.md b/benchmark/tensorflow/rnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..da8e7b8b07969051cbec3ac6a713eaf7fc738a55
--- /dev/null
+++ b/benchmark/tensorflow/rnn/README.md
@@ -0,0 +1,5 @@
+You also should install tflearn:
+
+```bash
+pip install -r requirements.txt
+```
diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py
new file mode 100755
index 0000000000000000000000000000000000000000..f538329a15ea9ad9293c97c94340989e2c421eb2
--- /dev/null
+++ b/benchmark/tensorflow/rnn/reader.py
@@ -0,0 +1,92 @@
+import os.path
+import io
+import numpy as np
+import tensorflow as tf
+
+# tflearn
+import tflearn
+from tflearn.data_utils import to_categorical, pad_sequences
+from tflearn.datasets import imdb
+
+FLAGS = tf.app.flags.FLAGS
+
+
+class DataSet(object):
+    def __init__(self, data, labels):
+        assert data.shape[0] == labels.shape[0], (
+            'data.shape: %s labels.shape: %s' % (data.shape, labels.shape))
+        self._num_examples = data.shape[0]
+
+        self._data = data
+        self._labels = labels
+        self._epochs_completed = 0
+        self._index_in_epoch = 0
+
+    @property
+    def data(self):
+        return self._data
+
+    @property
+    def labels(self):
+        return self._labels
+
+    @property
+    def num_examples(self):
+        return self._num_examples
+
+    @property
+    def epochs_completed(self):
+        return self._epochs_completed
+
+    def next_batch(self, batch_size):
+        assert batch_size <= self._num_examples
+
+        start = self._index_in_epoch
+        self._index_in_epoch += batch_size
+        if self._index_in_epoch > self._num_examples:
+            # Finished epoch
+            self._epochs_completed += 1
+            # Shuffle the data
+            perm = np.arange(self._num_examples)
+            np.random.shuffle(perm)
+            self._data = self._data[perm]
+            self._labels = self._labels[perm]
+            # Start next epoch
+            start = 0
+            self._index_in_epoch = batch_size
+
+        end = self._index_in_epoch
+
+        return self._data[start:end], self._labels[start:end]
+
+
+def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
+
+    # IMDB Dataset loading
+    train, test, _ = imdb.load_data(
+        path=file_path,
+        n_words=vocab_size,
+        valid_portion=val_fraction,
+        sort_by_len=False)
+    trainX, trainY = train
+    testX, testY = test
+
+    # Data preprocessing
+    # Sequence padding
+    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
+    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
+    # Converting labels to binary vectors
+    trainY = to_categorical(trainY, nb_classes=2)
+    testY = to_categorical(testY, nb_classes=2)
+
+    train_dataset = DataSet(trainX, trainY)
+
+    return train_dataset
+
+
+def main():
+    create_datasets('imdb.pkl')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/tensorflow/rnn/requirements.txt b/benchmark/tensorflow/rnn/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4242e7d24fbbeb18e8fb9a760d76fa6d5363b03f
--- /dev/null
+++ b/benchmark/tensorflow/rnn/requirements.txt
@@ -0,0 +1 @@
+tflearn
diff --git a/benchmark/tensorflow/rnn/rnn.py b/benchmark/tensorflow/rnn/rnn.py
new file mode 100755
index 0000000000000000000000000000000000000000..f288083e13656563b511980553245142efec4e65
--- /dev/null
+++ b/benchmark/tensorflow/rnn/rnn.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import math
+import time
+import numpy as np
+from datetime import datetime
+
+import reader
+import tensorflow as tf
+from tensorflow.python.ops import rnn
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+                            """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+                            """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('emb_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+VOCAB_SIZE = 30000
+NUM_CLASS = 2
+
+
+def get_feed_dict(x_data, y_data=None):
+    feed_dict = {}
+
+    if y_data is not None:
+        feed_dict[y_input] = y_data
+
+    for i in xrange(x_data.shape[0]):
+        feed_dict[x_input[i]] = x_data[i, :, :]
+
+    return feed_dict
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+# Note input * W is done in LSTMCell, 
+# which is different from PaddlePaddle
+def single_lstm(name,
+                incoming,
+                n_units,
+                use_peepholes=True,
+                return_seq=False,
+                return_state=False):
+    with tf.name_scope(name) as scope:
+        cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
+        output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
+        out = output if return_seq else output[-1]
+        return (out, _cell_state) if return_state else out
+
+
+def lstm(name,
+         incoming,
+         n_units,
+         use_peepholes=True,
+         return_seq=False,
+         return_state=False,
+         num_layers=1):
+    with tf.name_scope(name) as scope:
+        lstm_cell = tf.nn.rnn_cell.LSTMCell(
+            n_units, use_peepholes=use_peepholes)
+        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
+        initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
+        if not isinstance(incoming, list):
+            # if the input is embeding, the Tensor shape : [None, time_step, emb_size]
+            incoming = [
+                tf.squeeze(input_, [1])
+                for input_ in tf.split(1, FLAGS.max_len, incoming)
+            ]
+        outputs, state = tf.nn.rnn(cell,
+                                   incoming,
+                                   initial_state=initial_state,
+                                   dtype=tf.float32)
+        out = outputs if return_seq else outputs[-1]
+        return (out, _cell_state) if return_state else out
+
+
+def embedding(name, incoming, vocab_size, emb_size):
+    with tf.name_scope(name) as scope:
+        #with tf.device("/cpu:0"):
+        embedding = tf.get_variable(
+            name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+        out = tf.nn.embedding_lookup(embedding, incoming)
+        return out
+
+
+def fc(name, inpOp, nIn, nOut, act=True):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)
+
+        net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+                  tf.matmul(inpOp, kernel) + biases
+
+        return net
+
+
+def inference(seq):
+    net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
+    print "emb:", get_incoming_shape(net)
+    net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
+    print "lstm:", get_incoming_shape(net)
+    net = fc('fc1', net, FLAGS.hidden_size, 2)
+    return net
+
+
+def loss(logits, labels):
+    # one label index for one sample
+    labels = tf.cast(labels, tf.float32)
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits, labels, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def time_tensorflow_run(session, target, x_input, y_input, info_string):
+    num_steps_burn_in = 50
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    if not isinstance(target, list):
+        target = [target]
+    target_op = tf.group(*target)
+    train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        data, label = train_dataset.next_batch(FLAGS.batch_size)
+        _ = session.run(target_op, feed_dict={x_input: data, y_input: label})
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                print('%s: step %d, duration = %.3f' %
+                      (datetime.now(), i - num_steps_burn_in, duration))
+            total_duration += duration
+            total_duration_squared += duration * duration
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    with tf.Graph().as_default():
+        global_step = 0
+        with tf.device('/cpu:0'):
+            global_step = tf.Variable(0, trainable=False)
+        with tf.device('/gpu:0'):
+            #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
+            #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
+            x_input = tf.placeholder(
+                tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
+            y_input = tf.placeholder(
+                tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
+            # Generate some dummy sequnce.
+
+            last_layer = inference(x_input)
+
+            objective = loss(last_layer, y_input)
+            opt = tf.train.AdamOptimizer(0.001)
+            grads = opt.compute_gradients(objective)
+            apply_gradient_op = opt.apply_gradients(
+                grads, global_step=global_step)
+
+            init = tf.initialize_all_variables()
+            sess = tf.Session(config=tf.ConfigProto(
+                allow_soft_placement=True,
+                log_device_placement=FLAGS.log_device_placement))
+            sess.run(init)
+
+            run_forward = True
+            run_forward_backward = True
+            if FLAGS.forward_only and FLAGS.forward_backward_only:
+                raise ValueError("Cannot specify --forward_only and "
+                                 "--forward_backward_only at the same time.")
+            if FLAGS.forward_only:
+                run_forward_backward = False
+            elif FLAGS.forward_backward_only:
+                run_forward = False
+
+            if run_forward:
+                time_tensorflow_run(sess, last_layer, x_input, y_input,
+                                    "Forward")
+
+            if run_forward_backward:
+                with tf.control_dependencies([apply_gradient_op]):
+                    train_op = tf.no_op(name='train')
+                time_tensorflow_run(sess, [train_op, objective], x_input,
+                                    y_input, "Forward-backward")
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
new file mode 100755
index 0000000000000000000000000000000000000000..eabee4fa8fe6325212ace1c11be4862cd2720b08
--- /dev/null
+++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import re
+import math
+import time
+import numpy as np
+from datetime import datetime
+
+import reader
+import tensorflow as tf
+from tensorflow.python.ops import rnn
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('emb_size', 64, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+
+VOCAB_SIZE = 30000
+NUM_CLASS = 2
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+
+
+def get_incoming_shape(incoming):
+    """ Returns the incoming data shape """
+    if isinstance(incoming, tf.Tensor):
+        return incoming.get_shape().as_list()
+    elif type(incoming) in [np.array, list, tuple]:
+        return np.shape(incoming)
+    else:
+        raise Exception("Invalid incoming layer.")
+
+
+# Note input * W is done in LSTMCell, 
+# which is different from PaddlePaddle
+def single_lstm(name,
+                incoming,
+                n_units,
+                use_peepholes=True,
+                return_seq=False,
+                return_state=False):
+    with tf.name_scope(name) as scope:
+        cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
+        output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
+        out = output if return_seq else output[-1]
+        return (out, _cell_state) if return_state else out
+
+
+def lstm(name,
+         incoming,
+         n_units,
+         use_peepholes=True,
+         return_seq=False,
+         return_state=False,
+         num_layers=1):
+    with tf.name_scope(name) as scope:
+        lstm_cell = tf.nn.rnn_cell.LSTMCell(
+            n_units, use_peepholes=use_peepholes)
+        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
+        initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
+        if not isinstance(incoming, list):
+            # if the input is embeding, the Tensor shape : [None, time_step, emb_size]
+            incoming = [
+                tf.squeeze(input_, [1])
+                for input_ in tf.split(1, FLAGS.max_len, incoming)
+            ]
+        outputs, state = tf.nn.rnn(cell,
+                                   incoming,
+                                   initial_state=initial_state,
+                                   dtype=tf.float32)
+        out = outputs if return_seq else outputs[-1]
+        return (out, _cell_state) if return_state else out
+
+
+def embedding(name, incoming, vocab_size, emb_size):
+    with tf.name_scope(name) as scope:
+        #with tf.device("/cpu:0"):
+        embedding = tf.get_variable(
+            name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+        out = tf.nn.embedding_lookup(embedding, incoming)
+        return out
+
+
+def fc(name, inpOp, nIn, nOut, act=True):
+    with tf.name_scope(name) as scope:
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
+            dtype=tf.float32)
+
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)
+
+        net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+                  tf.matmul(inpOp, kernel) + biases
+
+        return net
+
+
+def inference(seq):
+    net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
+    print "emb:", get_incoming_shape(net)
+    net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
+    print "lstm:", get_incoming_shape(net)
+    net = fc('fc1', net, FLAGS.hidden_size, 2)
+    return net
+
+
+def loss(logits, labels):
+    # one label index for one sample
+    #labels = tf.cast(labels, tf.int64)
+    # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+    #                logits, labels, name='cross_entropy_per_example')
+    labels = tf.cast(labels, tf.float32)
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits, labels, name='cross_entropy_per_example')
+    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+    tf.add_to_collection('losses', cross_entropy_mean)
+    return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def tower_loss(scope):
+    """Calculate the total loss on a single tower running the model.
+    Args:
+        scope: unique prefix string identifying the tower, e.g. 'tower_0'
+    Returns:
+        Tensor of shape [] containing the total loss for a batch of data
+    """
+    data, label = train_dataset.next_batch(FLAGS.batch_size)
+
+    # Build a Graph that computes the logits predictions from the
+    # inference model.
+    last_layer = inference(data)
+
+    # Build the portion of the Graph calculating the losses. Note that we will
+    # assemble the total_loss using a custom function below.
+    #_ = loss(last_layer, label)
+    _ = loss(last_layer, label)
+
+    # Assemble all of the losses for the current tower only.
+    losses = tf.get_collection('losses', scope)
+
+    # Calculate the total loss for the current tower.
+    total_loss = tf.add_n(losses, name='total_loss')
+
+    # Compute the moving average of all individual losses and the total loss.
+    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+    loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+    # Attach a scalar summary to all individual losses and the total loss; do the
+    # same for the averaged version of the losses.
+    for l in losses + [total_loss]:
+        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+        # session. This helps the clarity of presentation on tensorboard.
+        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+        # Name each loss as '(raw)' and name the moving average version of the loss
+        # as the original loss name.
+        tf.scalar_summary(loss_name + ' (raw)', l)
+        #tf.scalar_summary(loss_name, loss_averages.average(l))
+
+    with tf.control_dependencies([loss_averages_op]):
+        total_loss = tf.identity(total_loss)
+    return total_loss
+
+
+def average_gradients(tower_grads):
+    """Calculate the average gradient for each shared variable across all towers.
+  Note that this function provides a synchronization point across all towers.
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over individual gradients. The inner list is over the gradient
+      calculation for each tower.
+  Returns:
+     List of pairs of (gradient, variable) where the gradient has been averaged
+     across all towers.
+  """
+    average_grads = []
+    for grad_and_vars in zip(*tower_grads):
+        # Note that each grad_and_vars looks like the following:
+        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+        grads = []
+        for g, _ in grad_and_vars:
+            # Add 0 dimension to the gradients to represent the tower.
+            expanded_g = tf.expand_dims(g, 0)
+
+            # Append on a 'tower' dimension which we will average over below.
+            grads.append(expanded_g)
+
+        # Average over the 'tower' dimension.
+        grad = tf.concat(0, grads)
+        grad = tf.reduce_mean(grad, 0)
+
+        # Keep in mind that the Variables are redundant because they are shared
+        # across towers. So .. we will just return the first tower's pointer to
+        # the Variable.
+        v = grad_and_vars[0][1]
+        grad_and_var = (grad, v)
+        average_grads.append(grad_and_var)
+    return average_grads
+
+
+def time_tensorflow_run(session, target):
+    num_steps_burn_in = 80
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        _ = session.run(target, feed_dict={x_input: data, y_input: label})
+        _, loss_value = session.run(target)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+                examples_per_sec = num_examples_per_step / duration
+                # sec_per_batch = duration / FLAGS.num_gpus
+                sec_per_batch = duration
+
+                format_str = (
+                    '%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
+                    'sec/batch batch_size= %d)')
+                print(format_str %
+                      (datetime.now(), i - num_steps_burn_in, loss_value,
+                       duration, sec_per_batch, num_examples_per_step))
+
+            total_duration += duration
+            total_duration_squared += duration * duration
+
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+    with tf.Graph().as_default(), tf.device('/cpu:0'):
+        # Create a variable to count the number of train() calls. This equals the
+        # number of batches processed * FLAGS.num_gpus.
+        global_step = tf.get_variable(
+            'global_step', [],
+            initializer=tf.constant_initializer(0),
+            trainable=False)
+
+        # Calculate the learning rate schedule.
+        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+                                 FLAGS.batch_size)
+        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+        # Create an optimizer that performs gradient descent.
+        opt = tf.train.AdamOptimizer(0.001)
+
+        #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+
+        # Calculate the gradients for each model tower.
+        tower_grads = []
+        for i in xrange(FLAGS.num_gpus):
+            with tf.device('/gpu:%d' % i):
+                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+                    # Calculate the loss for one tower of the model. This function
+                    # constructs the entire model but shares the variables across
+                    # all towers.
+                    loss = tower_loss(scope)
+
+                    # Reuse variables for the next tower.
+                    tf.get_variable_scope().reuse_variables()
+
+                    # Retain the summaries from the final tower.
+                    # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+                    # Calculate the gradients for the batch of data on this tower.
+                    grads = opt.compute_gradients(loss)
+
+                    # Keep track of the gradients across all towers.
+                    tower_grads.append(grads)
+
+        # We must calculate the mean of each gradient. Note that this is the
+        # synchronization point across all towers.
+        grads = average_gradients(tower_grads)
+
+        # Apply the gradients to adjust the shared variables.
+        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+        # Group all updates to into a single train op.
+        train_op = tf.group(apply_gradient_op)
+
+        # Build an initialization operation.
+        init = tf.initialize_all_variables()
+
+        # Start running operations on the Graph. allow_soft_placement must be set to
+        # True to build towers on GPU, as some of the ops do not have GPU
+        # implementations.
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+        time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+    run_benchmark()
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bb4c69cb95f965eff35f1c5a60376bf1e84f841b
--- /dev/null
+++ b/benchmark/tensorflow/rnn/run.sh
@@ -0,0 +1,29 @@
+set -e
+
+function test() {
+  lstm_num=$1
+  batch_size=$2
+  hid_size=$3
+  prefix=$4
+  python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \
+      --hidden_size=${hid_size} \
+      --forward_backward_only=1 \
+       > logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+#--lstm_num--batch_size--hidden_size--#
+test 2 64 256 
+test 2 64 512 
+test 2 64 1280 
+
+test 2 128 256 
+test 2 128 512 
+test 2 128 1280 
+
+test 2 256 256 
+test 2 256 512 
+test 2 256 1280 
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c2d7dd597e6da54cd5c4cda311fbbd18486b4647
--- /dev/null
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -0,0 +1,27 @@
+set -e
+
+function test() {
+  num_gpu=$1
+  lstm_num=$2
+  hid_size=$3
+  batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
+  batch_size=$4
+  python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
+      --num_gpus=${num_gpu} \
+      --hidden_size=${hid_size} \
+      --forward_backward_only=1 \
+      > logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+#--num_gpus--lstm_num--hiddne_size--batch_size--#
+test 4 2 256 128 
+test 4 2 256 256 
+test 4 2 256 512 
+
+test 4 2 512 128 
+test 4 2 512 256 
+test 4 2 512 512 
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 6702f45a168bf0dfc6cfca3ff8e68fbc79c92b11..d319442ef10b38b9edf5844e5540a92c7094c7ce 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,6 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
     )
 
   set_property(
@@ -143,4 +144,4 @@ function( Sphinx_add_targets target_base_name conf source base_destination )
 
     add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
   endif()
-endfunction()
\ No newline at end of file
+endfunction()
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
index 3bc0c1fd18448e3c2f0799295ac77d57cdc1bee7..afb84c6ff52af05769a99246d2e93380832c04e0 100644
--- a/cmake/check_packages.cmake
+++ b/cmake/check_packages.cmake
@@ -14,13 +14,9 @@ if(WITH_STYLE_CHECK)
   find_package(PythonInterp REQUIRED)
 endif()
 
-if(WITH_GLOG)
-  find_package(Glog REQUIRED)
-endif()
+find_package(Glog REQUIRED)
 
-if(WITH_GFLAGS)
-  find_package(Gflags REQUIRED)
-endif()
+find_package(Gflags REQUIRED)
 
 if(WITH_TESTING)
   find_package(GTest REQUIRED)
@@ -28,9 +24,7 @@ endif()
 
 if(WITH_DOC)
   find_package(Sphinx REQUIRED)
-  find_package(Doxygen REQUIRED)
   find_python_module(recommonmark REQUIRED)
-  find_python_module(breathe REQUIRED)
 endif()
 
 if(WITH_SWIG_PY)
diff --git a/cmake/enableCXX11.cmake b/cmake/enableCXX11.cmake
deleted file mode 100644
index dc8cc3371aa6e577676289750cc525ec4f1fb6b4..0000000000000000000000000000000000000000
--- a/cmake/enableCXX11.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-# Enable C++ 11 for GCC.
-# NOTE: It's only tested for gcc.
-include(CheckCXXCompilerFlag)
-CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORT_CXX11)
-CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORT_CXX0X)
-
-if(COMPILER_SUPPORT_CXX11)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-elseif(COMPILER_SUPPORT_CXX0X)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
-else()
-    message(FATAL_ERROR "Your compiler must support c++11")
-endif()
\ No newline at end of file
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index e087770991aefc17535d50c0539c50f6316520d7..0983d83b73a32d0615170155759d45001cc6ff54 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -2,6 +2,37 @@
 include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
+        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+        FORCE)
+endif()
+
+function(CheckCompilerCXX11Flag)
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
+            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
+        endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
+        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
+        # https://gist.github.com/yamaya/2924292
+        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
+            if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
+                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
+            endif()
+        else()
+            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
+                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
+            endif()
+        endif()   
+    endif()
+endfunction()
+
+CheckCompilerCXX11Flag()
+LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
+
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -41,9 +72,7 @@ macro(safe_set_nvflag flag_name)
     CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
     set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
     if(${safe_name})
-        set(CUDA_NVCC_FLAGS
-            --compiler-options;${flag_name}
-            ${CUDA_NVCC_FLAGS})
+        LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name})
     endif()
 endmacro()
 
@@ -109,8 +138,22 @@ foreach(flag ${GPU_COMMON_FLAGS})
 endforeach()
 
 
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
 
 function(specify_cuda_arch cuda_version cuda_arch)
     if(${cuda_version} VERSION_GREATER "8.0")
diff --git a/cmake/util.cmake b/cmake/util.cmake
index a8282f07184c34f77d506ed7ef40206fbbd55b41..38366373c6dbcc1d05c359484ae73ace1bbc59be 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -65,7 +65,7 @@ endmacro()
 # link_paddle_exe
 # add paddle library for a paddle executable, such as trainer, pserver.
 #
-# It will handle WITH_PYTHON/WITH_GLOG etc.
+# It will handle WITH_PYTHON etc.
 function(link_paddle_exe TARGET_NAME)
     if(WITH_RDMA)
         generate_rdma_links()
@@ -108,6 +108,8 @@ function(link_paddle_exe TARGET_NAME)
         paddle_cuda
         ${METRIC_LIBS}
         ${PROTOBUF_LIBRARY}
+        ${LIBGLOG_LIBRARY}
+        ${GFLAGS_LIBRARIES}
         ${CMAKE_THREAD_LIBS_INIT}
         ${CBLAS_LIBS}
         ${ZLIB_LIBRARIES}
@@ -119,27 +121,17 @@ function(link_paddle_exe TARGET_NAME)
             ${RDMA_LD_FLAGS}
             ${RDMA_LIBS})
     endif()
-    
+
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
             ${PYTHON_LIBRARIES})
     endif()
 
-    if(WITH_GLOG)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBGLOG_LIBRARY})
-    endif()
-
-    if(WITH_GFLAGS)
-        target_link_libraries(${TARGET_NAME}
-            ${GFLAGS_LIBRARIES})
-    endif()
-
     if(WITH_GPU)
-        if(NOT WITH_DSO OR WITH_METRIC) 
+        if(NOT WITH_DSO OR WITH_METRIC)
             target_link_libraries(${TARGET_NAME}
                 ${CUDNN_LIBRARY}
-                ${CUDA_curand_LIBRARY}) 
+                ${CUDA_curand_LIBRARY})
             CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
         endif()
 
@@ -148,6 +140,11 @@ function(link_paddle_exe TARGET_NAME)
             target_link_libraries(${TARGET_NAME} rt)
         endif()
     endif()
+
+    if(NOT WITH_DSO)
+        target_link_libraries(${TARGET_NAME}
+            ${WARPCTC_LIBRARY})
+    endif()
 endfunction()
 
 # link_paddle_test
@@ -201,5 +198,5 @@ function(create_resources res_file output)
     # Convert hex data for C compatibility
     string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
     # Append data to output file
-    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}0};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
 endfunction()
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a0518e07e88a1ff468c301523f888c7d95e15185
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,24 @@
+# Get the latest git tag.
+set(PADDLE_VERSION $ENV{PADDLE_VERSION})
+set(tmp_version "HEAD")
+while ("${PADDLE_VERSION}" STREQUAL "")
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+    WORKING_DIRECTORY ${PROJ_ROOT}
+    OUTPUT_VARIABLE GIT_TAG_NAME
+    RESULT_VARIABLE GIT_RESULT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if (NOT ${GIT_RESULT})
+    # Check the tag is a correct version
+    if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+      string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+    else()  # otherwise, get the previous git tag name.
+      set(tmp_version "${GIT_TAG_NAME}~1")
+    endif()
+  else()
+    set(PADDLE_VERSION "0.0.0")
+    message(WARNING "Cannot add paddle version from git tag")
+  endif()
+endwhile()
+
+message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/demo/gan/.gitignore b/demo/gan/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..93a6f5080a16a601cffb0bff51af9aef3ba3bae7
--- /dev/null
+++ b/demo/gan/.gitignore
@@ -0,0 +1,11 @@
+output/
+uniform_params/
+cifar_params/
+mnist_params/
+*.png
+.pydevproject
+.project
+*.log
+*.pyc
+data/mnist_data/
+data/cifar-10-batches-py/
diff --git a/demo/gan/README.md b/demo/gan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1908b534b0c1f63904d5503399b961d74ce0037c
--- /dev/null
+++ b/demo/gan/README.md
@@ -0,0 +1,13 @@
+# Generative Adversarial Networks (GAN) 
+
+This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
+
+The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
+
+In order to run the model, first download the corresponding data by running the shell script in ./data.
+Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
+
+$python gan_trainer.py -d cifar --use_gpu 1
+
+The generated images will be stored in ./cifar_samples/
+The corresponding models will be stored in ./cifar_params/
diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bbadc7c10c73e45a0948018b8812f79040d14bc4
--- /dev/null
+++ b/demo/gan/data/download_cifar.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+tar zxf cifar-10-python.tar.gz
+rm cifar-10-python.tar.gz
diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a77c81bf5af9ddb6634ff89460797ca543c5e517
--- /dev/null
+++ b/demo/gan/data/get_mnist_data.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env sh
+# This script downloads the mnist data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/mnist_data"
+mkdir "$DIR/mnist_data"
+cd "$DIR/mnist_data"
+
+echo "Downloading..."
+
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e $fname ]; then
+        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+        gunzip ${fname}.gz
+    fi
+done
diff --git a/demo/gan/gan_conf.py b/demo/gan/gan_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..86ac2dffe5f4490a88e12d1fa5e8cd9fa61a69f4
--- /dev/null
+++ b/demo/gan/gan_conf.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+mode = get_config_arg("mode", str, "generator")
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])
+
+is_generator_training = mode == "generator_training"
+is_discriminator_training = mode == "discriminator_training"
+is_generator = mode == "generator"
+is_discriminator = mode == "discriminator"
+
+# The network structure below follows the ref https://arxiv.org/abs/1406.2661
+# Here we used two hidden layers and batch_norm
+
+print('mode=%s' % mode)
+# the dim of the noise (z) as the input of the generator network
+noise_dim = 10
+# the dim of the hidden layer
+hidden_dim = 10
+# the dim of the generated sample
+sample_dim = 2
+
+settings(
+    batch_size=128,
+    learning_rate=1e-4,
+    learning_method=AdamOptimizer(beta1=0.5))
+
+
+def discriminator(sample):
+    """
+    discriminator ouputs the probablity of a sample is from generator
+    or real data.
+    The output has two dimenstional: dimension 0 is the probablity
+    of the sample is from generator and dimension 1 is the probabblity
+    of the sample is from real data.
+    """
+    param_attr = ParamAttr(is_static=is_generator_training)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=sample,
+        name="dis_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="dis_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="dis_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_generator_training, initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())
+
+
+def generator(noise):
+    """
+    generator generates a sample given noise
+    """
+    param_attr = ParamAttr(is_static=is_discriminator_training)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=noise,
+        name="gen_layer_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="gen_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="gen_layer_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_discriminator_training,
+            initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="gen_layer1",
+        size=sample_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+
+if is_generator_training:
+    noise = data_layer(name="noise", size=noise_dim)
+    sample = generator(noise)
+
+if is_discriminator_training:
+    sample = data_layer(name="sample", size=sample_dim)
+
+if is_generator_training or is_discriminator_training:
+    label = data_layer(name="label", size=1)
+    prob = discriminator(sample)
+    cost = cross_entropy(input=prob, label=label)
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
+    outputs(cost)
+
+if is_generator:
+    noise = data_layer(name="noise", size=noise_dim)
+    outputs(generator(noise))
diff --git a/demo/gan/gan_conf_image.py b/demo/gan/gan_conf_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..f89a4e706c3b7eeaa7858f54f8fa04a5e038b66e
--- /dev/null
+++ b/demo/gan/gan_conf_image.py
@@ -0,0 +1,298 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+mode = get_config_arg("mode", str, "generator")
+dataSource = get_config_arg("data", str, "mnist")
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])
+
+is_generator_training = mode == "generator_training"
+is_discriminator_training = mode == "discriminator_training"
+is_generator = mode == "generator"
+is_discriminator = mode == "discriminator"
+
+# The network structure below follows the dcgan paper 
+# (https://arxiv.org/abs/1511.06434)
+
+print('mode=%s' % mode)
+# the dim of the noise (z) as the input of the generator network
+noise_dim = 100
+# the number of filters in the layer in generator/discriminator that is 
+# closet to the image
+gf_dim = 64
+df_dim = 64
+if dataSource == "mnist":
+    sample_dim = 28  # image dim
+    c_dim = 1  # image color
+else:
+    sample_dim = 32
+    c_dim = 3
+s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
+s8, s16 = int(sample_dim / 8), int(sample_dim / 16)
+
+settings(
+    batch_size=128,
+    learning_rate=2e-4,
+    learning_method=AdamOptimizer(beta1=0.5))
+
+
+def conv_bn(input,
+            channels,
+            imgSize,
+            num_filters,
+            output_x,
+            stride,
+            name,
+            param_attr,
+            bias_attr,
+            param_attr_bn,
+            bn,
+            trans=False,
+            act=ReluActivation()):
+    """
+    conv_bn is a utility function that constructs a convolution/deconv layer 
+    with an optional batch_norm layer
+
+    :param bn: whether to use batch_norm_layer
+    :type bn: bool
+    :param trans: whether to use conv (False) or deconv (True)
+    :type trans: bool
+    """
+
+    # calculate the filter_size and padding size based on the given
+    # imgSize and ouput size
+    tmp = imgSize - (output_x - 1) * stride
+    if tmp <= 1 or tmp > 5:
+        raise ValueError("conv input-output dimension does not fit")
+    elif tmp <= 3:
+        filter_size = tmp + 2
+        padding = 1
+    else:
+        filter_size = tmp
+        padding = 0
+
+    print(imgSize, output_x, stride, filter_size, padding)
+
+    if trans:
+        nameApx = "_conv"
+    else:
+        nameApx = "_convt"
+
+    if bn:
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=LinearActivation(),
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
+
+        conv_bn = batch_norm_layer(
+            conv,
+            act=act,
+            name=name + nameApx + "_bn",
+            bias_attr=bias_attr,
+            param_attr=param_attr_bn,
+            use_global_stats=False)
+
+        return conv_bn
+    else:
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=act,
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
+        return conv
+
+
+def generator(noise):
+    """
+    generator generates a sample given noise
+    """
+    param_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
+
+    h1 = fc_layer(
+        input=noise,
+        name="gen_layer_h1",
+        size=s8 * s8 * gf_dim * 4,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    h1_bn = batch_norm_layer(
+        h1,
+        act=ReluActivation(),
+        name="gen_layer_h1_bn",
+        bias_attr=bias_attr,
+        param_attr=param_attr_bn,
+        use_global_stats=False)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=gf_dim * 4,
+        output_x=s8,
+        num_filters=gf_dim * 2,
+        imgSize=s4,
+        stride=2,
+        name="gen_layer_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    h3_bn = conv_bn(
+        h2_bn,
+        channels=gf_dim * 2,
+        output_x=s4,
+        num_filters=gf_dim,
+        imgSize=s2,
+        stride=2,
+        name="gen_layer_h3",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    return conv_bn(
+        h3_bn,
+        channels=gf_dim,
+        output_x=s2,
+        num_filters=c_dim,
+        imgSize=sample_dim,
+        stride=2,
+        name="gen_layer_h4",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False,
+        trans=True,
+        act=TanhActivation())
+
+
+def discriminator(sample):
+    """
+    discriminator ouputs the probablity of a sample is from generator
+    or real data.
+    The output has two dimenstional: dimension 0 is the probablity
+    of the sample is from generator and dimension 1 is the probabblity
+    of the sample is from real data.
+    """
+    param_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
+
+    h0 = conv_bn(
+        sample,
+        channels=c_dim,
+        imgSize=sample_dim,
+        num_filters=df_dim,
+        output_x=s2,
+        stride=2,
+        name="dis_h0",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False)
+
+    h1_bn = conv_bn(
+        h0,
+        channels=df_dim,
+        imgSize=s2,
+        num_filters=df_dim * 2,
+        output_x=s4,
+        stride=2,
+        name="dis_h1",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=df_dim * 2,
+        imgSize=s4,
+        num_filters=df_dim * 4,
+        output_x=s8,
+        stride=2,
+        name="dis_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)
+
+    return fc_layer(
+        input=h2_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())
+
+
+if is_generator_training:
+    noise = data_layer(name="noise", size=noise_dim)
+    sample = generator(noise)
+
+if is_discriminator_training:
+    sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)
+
+if is_generator_training or is_discriminator_training:
+    label = data_layer(name="label", size=1)
+    prob = discriminator(sample)
+    cost = cross_entropy(input=prob, label=label)
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
+    outputs(cost)
+
+if is_generator:
+    noise = data_layer(name="noise", size=noise_dim)
+    outputs(generator(noise))
diff --git a/demo/gan/gan_trainer.py b/demo/gan/gan_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a26c230f7a21cc6dd4a3cdb52e32730b1ce73ca
--- /dev/null
+++ b/demo/gan/gan_trainer.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import random
+import numpy
+import cPickle
+import sys, os
+from PIL import Image
+
+from paddle.trainer.config_parser import parse_config
+from paddle.trainer.config_parser import logger
+import py_paddle.swig_paddle as api
+import matplotlib.pyplot as plt
+
+
+def plot2DScatter(data, outputfile):
+    '''
+    Plot the data as a 2D scatter plot and save to outputfile
+    data needs to be two dimensinoal
+    '''
+    x = data[:, 0]
+    y = data[:, 1]
+    logger.info("The mean vector is %s" % numpy.mean(data, 0))
+    logger.info("The std vector is %s" % numpy.std(data, 0))
+
+    heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
+    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
+
+    plt.clf()
+    plt.scatter(x, y)
+    plt.savefig(outputfile, bbox_inches='tight')
+
+
+def CHECK_EQ(a, b):
+    assert a == b, "a=%s, b=%s" % (a, b)
+
+
+def copy_shared_parameters(src, dst):
+    '''
+    copy the parameters from src to dst
+    :param src: the source of the parameters
+    :type src: GradientMachine
+    :param dst: the destination of the parameters
+    :type dst: GradientMachine
+    '''
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
+    src_params = dict([(p.getName(), p) for p in src_params])
+
+    for i in xrange(dst.getParameterSize()):
+        dst_param = dst.getParameter(i)
+        src_param = src_params.get(dst_param.getName(), None)
+        if src_param is None:
+            continue
+        src_value = src_param.getBuf(api.PARAMETER_VALUE)
+        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
+        CHECK_EQ(len(src_value), len(dst_value))
+        dst_value.copyFrom(src_value)
+        dst_param.setValueUpdated()
+
+
+def print_parameters(src):
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
+
+    print "***************"
+    for p in src_params:
+        print "Name is %s" % p.getName()
+        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
+        )
+
+
+def load_mnist_data(imageFile):
+    f = open(imageFile, "rb")
+    f.read(16)
+
+    # Define number of samples for train/test
+    if "train" in imageFile:
+        n = 60000
+    else:
+        n = 10000
+
+    data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
+    data = data / 255.0 * 2.0 - 1.0
+
+    f.close()
+    return data.astype('float32')
+
+
+def load_cifar_data(cifar_path):
+    batch_size = 10000
+    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
+    for i in range(1, 6):
+        file = cifar_path + "/data_batch_" + str(i)
+        fo = open(file, 'rb')
+        dict = cPickle.load(fo)
+        fo.close()
+        data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
+
+    data = data / 255.0 * 2.0 - 1.0
+    return data
+
+
+# synthesize 2-D uniform data
+def load_uniform_data():
+    data = numpy.random.rand(1000000, 2).astype('float32')
+    return data
+
+
+def merge(images, size):
+    if images.shape[1] == 28 * 28:
+        h, w, c = 28, 28, 1
+    else:
+        h, w, c = 32, 32, 3
+    img = numpy.zeros((h * size[0], w * size[1], c))
+    for idx in xrange(size[0] * size[1]):
+        i = idx % size[1]
+        j = idx // size[1]
+        img[j*h:j*h+h, i*w:i*w+w, :] = \
+          ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
+    return img.astype('uint8')
+
+
+def save_images(images, path):
+    merged_img = merge(images, [8, 8])
+    if merged_img.shape[2] == 1:
+        im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
+    else:
+        im = Image.fromarray(merged_img, mode="RGB")
+    im.save(path)
+
+
+def get_real_samples(batch_size, data_np):
+    return data_np[numpy.random.choice(
+        data_np.shape[0], batch_size, replace=False), :]
+
+
+def get_noise(batch_size, noise_dim):
+    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
+
+
+def get_fake_samples(generator_machine, batch_size, noise):
+    gen_inputs = api.Arguments.createArguments(1)
+    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+    gen_outputs = api.Arguments.createArguments(0)
+    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
+    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
+    return fake_samples
+
+
+def get_training_loss(training_machine, inputs):
+    outputs = api.Arguments.createArguments(0)
+    training_machine.forward(inputs, outputs, api.PASS_TEST)
+    loss = outputs.getSlotValue(0).copyToNumpyMat()
+    return numpy.mean(loss)
+
+
+def prepare_discriminator_data_batch_pos(batch_size, data_np):
+    real_samples = get_real_samples(batch_size, data_np)
+    labels = numpy.ones(batch_size, dtype='int32')
+    inputs = api.Arguments.createArguments(2)
+    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
+    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
+    return inputs
+
+
+def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
+    fake_samples = get_fake_samples(generator_machine, batch_size, noise)
+    labels = numpy.zeros(batch_size, dtype='int32')
+    inputs = api.Arguments.createArguments(2)
+    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
+    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
+    return inputs
+
+
+def prepare_generator_data_batch(batch_size, noise):
+    label = numpy.ones(batch_size, dtype='int32')
+    inputs = api.Arguments.createArguments(2)
+    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
+    return inputs
+
+
+def find(iterable, cond):
+    for item in iterable:
+        if cond(item):
+            return item
+    return None
+
+
+def get_layer_size(model_conf, layer_name):
+    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
+    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
+    return layer_conf.size
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
+    args = parser.parse_args()
+    data_source = args.data_source
+    use_gpu = args.use_gpu
+    assert data_source in ["mnist", "cifar", "uniform"]
+    assert use_gpu in ["0", "1"]
+
+    if not os.path.exists("./%s_samples/" % data_source):
+        os.makedirs("./%s_samples/" % data_source)
+
+    if not os.path.exists("./%s_params/" % data_source):
+        os.makedirs("./%s_params/" % data_source)
+
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=100', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./%s_params/" % data_source)
+
+    if data_source == "uniform":
+        conf = "gan_conf.py"
+        num_iter = 10000
+    else:
+        conf = "gan_conf_image.py"
+        num_iter = 1000
+
+    gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
+    dis_conf = parse_config(conf,
+                            "mode=discriminator_training,data=" + data_source)
+    generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
+    batch_size = dis_conf.opt_config.batch_size
+    noise_dim = get_layer_size(gen_conf.model_config, "noise")
+
+    if data_source == "mnist":
+        data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
+    elif data_source == "cifar":
+        data_np = load_cifar_data("./data/cifar-10-batches-py/")
+    else:
+        data_np = load_uniform_data()
+
+    # this creates a gradient machine for discriminator
+    dis_training_machine = api.GradientMachine.createFromConfigProto(
+        dis_conf.model_config)
+    # this create a gradient machine for generator    
+    gen_training_machine = api.GradientMachine.createFromConfigProto(
+        gen_conf.model_config)
+
+    # generator_machine is used to generate data only, which is used for
+    # training discriminator
+    logger.info(str(generator_conf.model_config))
+    generator_machine = api.GradientMachine.createFromConfigProto(
+        generator_conf.model_config)
+
+    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
+
+    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
+
+    dis_trainer.startTrain()
+    gen_trainer.startTrain()
+
+    # Sync parameters between networks (GradientMachine) at the beginning
+    copy_shared_parameters(gen_training_machine, dis_training_machine)
+    copy_shared_parameters(gen_training_machine, generator_machine)
+
+    # constrain that either discriminator or generator can not be trained
+    # consecutively more than MAX_strike times
+    curr_train = "dis"
+    curr_strike = 0
+    MAX_strike = 5
+
+    for train_pass in xrange(100):
+        dis_trainer.startTrainPass()
+        gen_trainer.startTrainPass()
+        for i in xrange(num_iter):
+            # Do forward pass in discriminator to get the dis_loss
+            noise = get_noise(batch_size, noise_dim)
+            data_batch_dis_pos = prepare_discriminator_data_batch_pos(
+                batch_size, data_np)
+            dis_loss_pos = get_training_loss(dis_training_machine,
+                                             data_batch_dis_pos)
+
+            data_batch_dis_neg = prepare_discriminator_data_batch_neg(
+                generator_machine, batch_size, noise)
+            dis_loss_neg = get_training_loss(dis_training_machine,
+                                             data_batch_dis_neg)
+
+            dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
+
+            # Do forward pass in generator to get the gen_loss
+            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
+            gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
+
+            if i % 100 == 0:
+                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
+                                                                 dis_loss_neg)
+                print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)
+
+            # Decide which network to train based on the training history
+            # And the relative size of the loss        
+            if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
+               ((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
+                if curr_train == "dis":
+                    curr_strike += 1
+                else:
+                    curr_train = "dis"
+                    curr_strike = 1
+                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
+                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
+                copy_shared_parameters(dis_training_machine,
+                                       gen_training_machine)
+
+            else:
+                if curr_train == "gen":
+                    curr_strike += 1
+                else:
+                    curr_train = "gen"
+                    curr_strike = 1
+                gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
+                # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
+                # so that we do not need to copy shared parameters. 
+                copy_shared_parameters(gen_training_machine,
+                                       dis_training_machine)
+                copy_shared_parameters(gen_training_machine, generator_machine)
+
+        dis_trainer.finishTrainPass()
+        gen_trainer.finishTrainPass()
+        # At the end of each pass, save the generated samples/images
+        fake_samples = get_fake_samples(generator_machine, batch_size, noise)
+        if data_source == "uniform":
+            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
+                          (data_source, train_pass))
+        else:
+            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
+                        (data_source, train_pass))
+    dis_trainer.finishTrain()
+    gen_trainer.finishTrain()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh
index ca9b0b5c9052548d73037e0434bf47d0c8347787..532178d627fe19ab8ea79ecae73e5328b5294bea 100755
--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
@@ -1,4 +1,5 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py
index b235010e4ece377beffaaa1b9247a77d7a96b712..db6666189e5b8008a6b66fb64afcdf98980e72bb 100644
--- a/demo/image_classification/data/process_cifar.py
+++ b/demo/image_classification/data/process_cifar.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py
index 28bf1bb02c1f08b2e8ec9acd38f0a8594b05ab66..6a315ff094c1af5f8250d8a22ff96740dddd9808 100644
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 
 #
 # {'img_size': 32,
-# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+# 'settings': a global object,
 # 'color': True,
 # 'mean_img_size': 32,
 # 'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
 
     settings.logger.info('Image size: %s', settings.img_size)
     settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
-        dense_vector(settings.img_raw_size),  # image feature
-        integer_value(settings.num_classes)
-    ]  # labels
+    settings.input_types = {
+        'image': dense_vector(settings.img_raw_size),
+        'label': integer_value(settings.num_classes)
+    }
 
     settings.logger.info('DataProvider Initialization finished')
 
@@ -83,4 +83,7 @@ def processData(settings, file_list):
                         img, settings.img_mean, settings.img_size,
                         settings.is_train, settings.color)
                     label = data['labels'][i]
-                    yield img_feat.astype('float32'), int(label)
+                    yield {
+                        'image': img_feat.astype('float32'),
+                        'label': int(label)
+                    }
diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py
index b5c6431c06f77cef5c31ca844a8427eebaea2fce..f09605394a19e09d92e555eeefb0b5646625b618 100644
--- a/demo/image_classification/image_util.py
+++ b/demo/image_classification/image_util.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/predict.sh b/demo/image_classification/predict.sh
old mode 100644
new mode 100755
index 35ffae6c8cdc90bc9d71b35e05a5717ff7ba4970..9d5785c9a1a4dac12f7940fa708b1a79c6ec8a93
--- a/demo/image_classification/predict.sh
+++ b/demo/image_classification/predict.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py
index 6a47bd5851c99635dd7d3f1d5df67dd081ca4584..9a86aafcb2fa4d4354d1dd9443c1b73ddcda980b 100755
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py
index 10b9c1691b5e51273c73a975545cd36f3822e901..2947ad239c36f9a02ed67ccf5906380cb70e37ce 100755
--- a/demo/image_classification/preprocess.py
+++ b/demo/image_classification/preprocess.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh
index e3e86ff10675c0622867af2eb0d26c87f4bc2db5..c7396c6393599ef3f2c55089eb05f2435b2b4b82 100755
--- a/demo/image_classification/preprocess.sh
+++ b/demo/image_classification/preprocess.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index ed9b5220fff6a434cd332f0972d39c4149b3ebfe..6fc11caf1c75192242482c2e85f8167eb9fba4ec 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ paddle train \
 --test_all_data_in_one_period=1 \
 --use_gpu=1 \
 --trainer_count=1 \
---num_passes=200 \
+--num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
 
diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py
index 58ceff5fc2f46cac9997b6d8af2b0db0c43e0c75..8ee4a64c15f885023a6e19812885b4f76bb12af9 100755
--- a/demo/image_classification/vgg_16_cifar.py
+++ b/demo/image_classification/vgg_16_cifar.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3
--- /dev/null
+++ b/demo/introduction/.gitignore
@@ -0,0 +1,5 @@
+dataprovider.pyc
+empty.list
+train.log
+output
+train.list
diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py
index 8515022e18dc6bbf055e6db3121568acf1df1c55..5b48aad0408800676ae7da16eba2dcbb2124f25f 100644
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,8 +17,10 @@ import random
 
 
 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
+    input_types={'x': dense_vector(1),
+                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
     for i in xrange(2000):
         x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
diff --git a/demo/introduction/evaluate_model.py b/demo/introduction/evaluate_model.py
index ca4a1872731abde90e72cb167929b3d9e2e1ebf4..eeda43c5c86f3e49f758bf55b16a68387e64238c 100755
--- a/demo/introduction/evaluate_model.py
+++ b/demo/introduction/evaluate_model.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
index 06db8edd105ada071597ed1aa5e42f7de547174d..b7bbb90ddd287e3e312a490b53924ae76fb20d2c 100755
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index 7c838c1a8f5b3cb6ac732197c85cd7c728eb013f..ecafe955f9e5c1062168d5d7b6b4c639d6e72a99 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *
 
 # 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f:
-    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
     test_list=None,
     module='dataprovider',
     obj='process',
diff --git a/demo/mnist/data/generate_list.py b/demo/mnist/data/generate_list.py
index d880721f94c68bbbc1740f82872462efdb368fa2..49981cc7a93308bc96ad5097eba749440e958525 100644
--- a/demo/mnist/data/generate_list.py
+++ b/demo/mnist/data/generate_list.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py
index 6df4676da3bdc2e6949cc911fa3720cb51ddc568..4635833d36b9f21c992d96910f3ac9094ccefd2c 100644
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
@@ -1,10 +1,12 @@
 from paddle.trainer.PyDataProvider2 import *
+import numpy
 
 
 # Define a py data provider
 @provider(
     input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
+                 'label': integer_value(10)},
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
     imgf = filename + "-images-idx3-ubyte"
     labelf = filename + "-labels-idx1-ubyte"
@@ -20,12 +22,13 @@ def process(settings, filename):  # settings is not used currently.
     else:
         n = 10000
 
-    for i in range(n):
-        label = ord(l.read(1))
-        pixels = []
-        for j in range(28 * 28):
-            pixels.append(float(ord(f.read(1))) / 255.0)
-        yield {"pixel": pixels, 'label': label}
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}
 
     f.close()
     l.close()
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
index 084b32ac390b847379fd0e5e6a5fd33714730ec4..da90cd749a02976633d0f0d6e4352d8a85c7cdef 100755
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py
index f9e89bc588abacd98a8f5fc82a00fae6bb2de10e..a819b391c690fb473801eb2e7ba3161cc31b5b4b 100644
--- a/demo/mnist/vgg_16_mnist.py
+++ b/demo/mnist/vgg_16_mnist.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/embedding/extract_para.py b/demo/model_zoo/embedding/extract_para.py
index 47e06fae9caa9c3d9e0d6eb2e3f6633a776c5b1d..570b90c1f772c8f6abfc6cda02560fd3471ef0b6 100755
--- a/demo/model_zoo/embedding/extract_para.py
+++ b/demo/model_zoo/embedding/extract_para.py
@@ -1,5 +1,5 @@
 #!/bin/env python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/embedding/paraconvert.py b/demo/model_zoo/embedding/paraconvert.py
index 54155eff8e26b16ff5303d8d279e81b4bf8a90f4..ce7a70efc43d7f85708f1e12bb94739f3588370c 100755
--- a/demo/model_zoo/embedding/paraconvert.py
+++ b/demo/model_zoo/embedding/paraconvert.py
@@ -1,5 +1,5 @@
 #!/bin/env python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index 7821850fb25cc5b87aa305c2113efbf50b093ed1..f97ef2610734449c88fdfca6216b1cab57472b84 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,5 @@ set -x
 # download the dictionary and pretrained model 
 for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
 do 
-  # following is the google drive address
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
 done
diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py
index 7855126edcfec20de251e5bc08c08c7aab8f7a8e..4631816c43ef48839df1863a0a86c3ab00924d3f 100755
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/example/__init__.py b/demo/model_zoo/resnet/example/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/demo/model_zoo/resnet/example/__init__.py
+++ b/demo/model_zoo/resnet/example/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/example/image_list_provider.py b/demo/model_zoo/resnet/example/image_list_provider.py
index 9e415f76a53326c5809b7a8c508701c519ab443b..2cd8eb8bf850f41282ed5db2885dc0b7218c79f7 100644
--- a/demo/model_zoo/resnet/example/image_list_provider.py
+++ b/demo/model_zoo/resnet/example/image_list_provider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/extract_fea_c++.sh b/demo/model_zoo/resnet/extract_fea_c++.sh
index c7f9aea9a57df5f9021d6cb3743967dcc9aa3e5f..5447aa92dfb5facd3433eb4a1893e96e3c786c73 100755
--- a/demo/model_zoo/resnet/extract_fea_c++.sh
+++ b/demo/model_zoo/resnet/extract_fea_c++.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/extract_fea_py.sh b/demo/model_zoo/resnet/extract_fea_py.sh
index a70cef9a87e9337a4dacd4a98fb1e2cf53004221..2e87152f7f8598f487870291271cdee646105044 100755
--- a/demo/model_zoo/resnet/extract_fea_py.sh
+++ b/demo/model_zoo/resnet/extract_fea_py.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/get_model.sh b/demo/model_zoo/resnet/get_model.sh
index 89312d43edf8e4e7d639be73d5b3983ea916b902..b33d8178ab7859fc0b0d514fb19bec2c28a77c3d 100755
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
 
 for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz 
 do 
-  # following is the google drive address
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
   tar -xvf $file 
   rm $file
 done
diff --git a/demo/model_zoo/resnet/load_feature.py b/demo/model_zoo/resnet/load_feature.py
index b0948b75fd0ac9a3fa89070aed04d523ce286f4e..5d3d0c0d30ef710c37c98e93a51b2f813d636b59 100644
--- a/demo/model_zoo/resnet/load_feature.py
+++ b/demo/model_zoo/resnet/load_feature.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/net_diagram.sh b/demo/model_zoo/resnet/net_diagram.sh
index a21ab4345bfb31e5586cb07625b44e34ec1f7ec6..1b06ffa44eec8a0f312420c35699d3902f9a6400 100755
--- a/demo/model_zoo/resnet/net_diagram.sh
+++ b/demo/model_zoo/resnet/net_diagram.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/predict.sh b/demo/model_zoo/resnet/predict.sh
index 55cf16e34a759c3538ef4216fcb7e724ffd83b9f..2b67b17c48c60cc8a7b7c46a1c80a3f2bf281870 100755
--- a/demo/model_zoo/resnet/predict.sh
+++ b/demo/model_zoo/resnet/predict.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/model_zoo/resnet/resnet.py b/demo/model_zoo/resnet/resnet.py
index 015b74cd484596039b9fcf010576ca340d044db7..6fdd97fefc62392c93ecffae0fc918e8dc4b18c5 100644
--- a/demo/model_zoo/resnet/resnet.py
+++ b/demo/model_zoo/resnet/resnet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore
index d6bc73105b1abfdae3067b7fecd656079a56b57c..f71662563ff96d6227dd568d9951a90b0d09456e 100644
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
@@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
+data/pred.list
+data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
diff --git a/demo/quick_start/api_predict.py b/demo/quick_start/api_predict.py
new file mode 100755
index 0000000000000000000000000000000000000000..9bdffe1006281c58a595e2771561ba62e4c2d6bd
--- /dev/null
+++ b/demo/quick_start/api_predict.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import sparse_binary_vector
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python api_predict.py -h
+"""
+
+
+class QuickStartPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [sparse_binary_vector(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
+        output = self.network.forwardTest(input)
+        prob = output[0]["id"].tolist()
+        print("predicting labels is:")
+        print prob
+
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="dictionary file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    batch_size = options.batch_size
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
+
+    batch = []
+    labels = []
+    for line in sys.stdin:
+        [label, text] = line.split("\t")
+        labels.append(int(label))
+        batch.append([predict.get_index(text)])
+    print("labels is:")
+    print labels
+    predict.batch_predict(batch)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/quick_start/api_predict.sh b/demo/quick_start/api_predict.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c90d3b70548b3ef2a7e0e423c74cd97f1886c0fc
--- /dev/null
+++ b/demo/quick_start/api_predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+#Note the default model is pass-00002, you shold make sure the model path
+#exists or change the mode path.
+#only test on trainer_config.lr.py
+model=output/pass-00001/
+config=trainer_config.lr.py
+label=data/labels.list
+dict=data/dict.txt
+batch_size=20
+head -n$batch_size data/test.txt | python api_predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=$dict \
+     --batch_size=$batch_size
diff --git a/demo/quick_start/api_train.py b/demo/quick_start/api_train.py
index 66cbb856484d231613a0026be129a7bc3a7cfdf5..5699789daa4051661b0a72c69f4668f2d8bb9cb2 100644
--- a/demo/quick_start/api_train.py
+++ b/demo/quick_start/api_train.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/api_train.sh b/demo/quick_start/api_train.sh
index 40e9d0a09aaa6b672d6b3997c67c07a5e8a8c3d8..9b2a4e2f224b1677c458ede66a6a3bac09d8ad61 100755
--- a/demo/quick_start/api_train.sh
+++ b/demo/quick_start/api_train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/data/README.md b/demo/quick_start/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..63abcf7ebf31903213e44cf492b93e09f61db14e
--- /dev/null
+++ b/demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
+This dataset consists of electronics product reviews associated with
+binary labels (positive/negative) for sentiment classification.
+
+The preprocessed data can be downloaded by script `get_data.sh`.
+The data was derived from reviews_Electronics_5.json.gz at
+
+http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+
+If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/demo/quick_start/data/get_data.sh b/demo/quick_start/data/get_data.sh
index f355d63225b28ab495b34e72dd3be8d237ae08f4..a09a18f919e5a84f1f7c889a43f0a5fbf4a60a77 100755
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,14 +17,11 @@ set -e
 DIR="$( cd "$(dirname "$0")" ; pwd -P )"
 cd $DIR
 
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+# Download the preprocessed data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
 
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract package
+tar zxvf preprocessed_data.tar.gz
 
-unzip master.zip
-rm master.zip
-echo "Done."
+# Remove compressed package
+rm preprocessed_data.tar.gz
diff --git a/demo/quick_start/data/pred.list b/demo/quick_start/data/pred.list
deleted file mode 100644
index d88b2b63851101a8b40e706b32d8c17b5fabb201..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/pred.txt
diff --git a/demo/quick_start/data/pred.txt b/demo/quick_start/data/pred.txt
deleted file mode 100644
index 6ed5f738ddaff6645448d5e606dcef1baf01b282..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-the device is cute , but that &apos;s just about all that &apos;s good. the specs are what you &apos;d expect : it &apos;s a wifi mic , with some noise filter options. the app has the option to upload your baby &apos;s name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says &quot; cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying &quot; .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware .
-don &apos;t be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don &apos;t buy this thing. put your money to good use elsewhere .
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/data/proc_from_raw_data/get_data.sh
similarity index 62%
rename from demo/quick_start/preprocess.sh
rename to demo/quick_start/data/proc_from_raw_data/get_data.sh
index c9190e2dd2ef754bf3c7287006322b52493dc3a0..d976eaebfaa600778e0ab6bb0adbd7159f1cce2f 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,10 +16,26 @@
 # 1. size of pos : neg = 1:1.
 # 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
 # 3. distinct train set and test set.
-# 4. build dict
 
 set -e
 
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+# Download data
+echo "Downloading Amazon Electronics reviews data..."
+# http://jmcauley.ucsd.edu/data/amazon/
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+echo "Downloading mosesdecoder..."
+# https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+unzip master.zip
+rm master.zip
+
+##################
+# Preprocess data 
+echo "Preprocess data..."
 export LC_ALL=C
 UNAME_STR=`uname`
 
@@ -29,11 +45,11 @@ else
   SHUF_PROG='gshuf'
 fi
 
-mkdir -p data/tmp
-python preprocess.py -i data/reviews_Electronics_5.json.gz
+mkdir -p tmp
+python preprocess.py -i reviews_Electronics_5.json.gz
 # uniq and shuffle
-cd data/tmp
-echo 'uniq and shuffle...'
+cd tmp
+echo 'Uniq and shuffle...'
 cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
 cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
 
@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
 cat test.pos test.neg | ${SHUF_PROG} >../test.txt
 
 cd -
-echo 'data/train.txt' > data/train.list
-echo 'data/test.txt' > data/test.list
+echo 'train.txt' > train.list
+echo 'test.txt' > test.list
 
 # use 30k dict
-rm -rf data/tmp
-mv data/dict.txt data/dict_all.txt
-cat data/dict_all.txt | head -n 30001 > data/dict.txt
-echo 'preprocess finished'
+rm -rf tmp
+mv dict.txt dict_all.txt
+cat dict_all.txt | head -n 30001 > dict.txt
+echo 'Done.'
diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/data/proc_from_raw_data/preprocess.py
similarity index 94%
rename from demo/quick_start/preprocess.py
rename to demo/quick_start/data/proc_from_raw_data/preprocess.py
index d87fad632a7429f7d9682badabe4c72ca127354f..72bd95f21d8bde8b3d1962ea10ecf6fc7d0ea478 100755
--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -1,6 +1,6 @@
 # -*- coding: UTF-8 -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-1. (remove HTML before or not)tokensizing
+1. Tokenize the words and punctuation 
 2. pos sample : rating score 5; neg sample: rating score 1-2.
 
 Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
     sentences : a list of input sentences.
     return: a list of processed text.
     """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    if not os.path.exists(dir):
+        sys.exit(
+            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
+        )
     tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
     assert isinstance(sentences, list)
     text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
         num_batch, instance, pre_fix = parse_queue.get()
         if num_batch == -1:  ### parse_queue finished
             tokenize_queue.put((-1, None, None))
-            sys.stderr.write("tokenize theread %s finish\n" % (id))
+            sys.stderr.write("Thread %s finish\n" % (id))
             break
         tokenize_instance = tokenize(instance)
         tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py
index a5156a2d40cc04c02e50d676045ae6da8937ba01..2745495586449b5d1eb64ae570f73eb6b14dbdfe 100644
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
 
     # setting.input_types specifies what the data types the data provider
     # generates.
-    settings.input_types = [
+    settings.input_types = {
         # The first input is a sparse_binary_vector,
         # which means each dimension of the vector is either 0 or 1. It is the
         # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
         # The second input is an integer. It represents the category id of the
         # sample. 2 means there are two labels in the dataset.
         # (1 for positive and 0 for negative)
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 # Delaring a data provider. It has an initializer 'data_initialzer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
             # Return the features for the current comment. The first is a list
             # of ids representing a 0-1 binary sparse vector of the text,
             # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
 
 
 # Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py
index 286f3f5c82081f1a6e02a26023969790792a78a3..ddfa3ce9b73555cb3b7f5a44314ca35b12d41ede 100755
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,13 +19,13 @@ UNK_IDX = 0
 
 def initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
         # Define the type of the first input as sequence of integer.
         # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
         # Define the second input for label id
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
             label, comment = line.strip().split('\t')
             words = comment.split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value(
-            len(dictionary), seq_type=SequenceType.SEQUENCE)
-    ]
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
 
 
 @provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
index b1e5e44f0b644547d6573ef635084b555237bea6..f02e5038e92790c7f1ddcd84a09c6d9a02f84ac4 100755
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index b3c471608c3248bfc714d5e44dd927f25dd23ea0..e3595fce7519297058e1eeb66487692267ddcfcc 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/demo/quick_start/trainer_config.bidi-lstm.py
index 51deaf31f94681b6b61f98f798cef14a65ec92cb..ca1d1f8d099b5a3f5276c108855c5e890e7214fe 100644
--- a/demo/quick_start/trainer_config.bidi-lstm.py
+++ b/demo/quick_start/trainer_config.bidi-lstm.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.cnn.py b/demo/quick_start/trainer_config.cnn.py
index 388efa75f903e0c7c803c99cd50d73a004133a67..f8c3d511f323ed9ec96be0a1951014c6db639003 100644
--- a/demo/quick_start/trainer_config.cnn.py
+++ b/demo/quick_start/trainer_config.cnn.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.db-lstm.py b/demo/quick_start/trainer_config.db-lstm.py
index 02bc898d881efbd3bfaed95d45cd9e70ed046746..fba802b4600b33cfbfd0820cce1f47e4d0f948ae 100644
--- a/demo/quick_start/trainer_config.db-lstm.py
+++ b/demo/quick_start/trainer_config.db-lstm.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.emb.py b/demo/quick_start/trainer_config.emb.py
index 8fd18a7aac704e62b137845edb46cce5bc373285..7410397ef656e363b232787995d3a869cd11b655 100644
--- a/demo/quick_start/trainer_config.emb.py
+++ b/demo/quick_start/trainer_config.emb.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.lr.py b/demo/quick_start/trainer_config.lr.py
index b9c9441baac28a8a8f6078065b75664819d6cd04..e5105aa89532d71c80c8ec77ca98ac6a8e9c8c58 100644
--- a/demo/quick_start/trainer_config.lr.py
+++ b/demo/quick_start/trainer_config.lr.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.lstm.py b/demo/quick_start/trainer_config.lstm.py
index 8821e02d9bd4a0d06b8afa99df8e0fac3e2fcefe..43b4ddac2dca5f6b9aa28f055e843abf12e92312 100644
--- a/demo/quick_start/trainer_config.lstm.py
+++ b/demo/quick_start/trainer_config.lstm.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/quick_start/trainer_config.resnet-lstm.py b/demo/quick_start/trainer_config.resnet-lstm.py
index 91e1581c386eb880d481b7352c4d21f3a5ef5c9a..89a837abb7cdeaaa249160123e1f2001d23d7aa1 100644
--- a/demo/quick_start/trainer_config.resnet-lstm.py
+++ b/demo/quick_start/trainer_config.resnet-lstm.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 This configuration is a demonstration of how to implement the stacked LSTM
 with residual connections, i.e. an LSTM layer takes the sum of the hidden states
@@ -46,11 +45,12 @@ is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
 tst = 'data/test.list' if not is_predict else 'data/pred.list'
 process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(train_list=trn,
-                        test_list=tst,
-                        module="dataprovider_emb",
-                        obj=process,
-                        args={"dictionary": word_dict})
+define_py_data_sources2(
+    train_list=trn,
+    test_list=tst,
+    module="dataprovider_emb",
+    obj=process,
+    args={"dictionary": word_dict})
 
 batch_size = 128 if not is_predict else 1
 settings(
@@ -58,10 +58,9 @@ settings(
     learning_rate=2e-3,
     learning_method=AdamOptimizer(),
     regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
+    gradient_clipping_threshold=25)
 
-bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
 
 data = data_layer(name="word", size=len(word_dict))
 emb = embedding_layer(input=data, size=128)
@@ -73,17 +72,15 @@ for i in range(3):
     # The input to the current layer is the sum of the hidden state
     # and input of the previous layer.
     current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(input=current_input, size=128,
-                               lstm_cell_attr=ExtraAttr(drop_rate=0.1))
+    hidden_state = simple_lstm(
+        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
     previous_input, previous_hidden_state = current_input, hidden_state
 
 lstm = previous_hidden_state
 
 lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_last, size=2,
-                  bias_attr=bias_attr,
-                  act=SoftmaxActivation())
-
+output = fc_layer(
+    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
 
 if is_predict:
     maxid = maxid_layer(output)
diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py
index 613e36b496e47edbc0eabd8f15a0abdcb50f6424..c20c65286621d701ad58409b539bbe9c813d453a 100755
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
     metas = meta[name]['__meta__']['raw_meta']
     for each_meta in metas:
+        slot_name = each_meta.get('name', '%s_id' % name)
         if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
         elif each_meta['type'] == 'embedding':
             is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                 len(each_meta['dict']),
                 seq_type=SequenceType.SEQUENCE
                 if is_seq else SequenceType.NO_SEQUENCE)
         elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
diff --git a/demo/recommendation/data/config_generator.py b/demo/recommendation/data/config_generator.py
index fa605458300f81da6772d88cfbad413e4dcf97fe..4ca496a252dffc62ed62bb8f2a5ee1661a940580 100644
--- a/demo/recommendation/data/config_generator.py
+++ b/demo/recommendation/data/config_generator.py
@@ -1,5 +1,5 @@
 #!/bin/env python2
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py
index 593c863670d5eb5d684adf643ff745f3914b656b..38e4679d266c331a751114cd13f0e3453016cf26 100644
--- a/demo/recommendation/data/meta_generator.py
+++ b/demo/recommendation/data/meta_generator.py
@@ -1,5 +1,5 @@
 #!/bin/env python2
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/data/ml_data.sh b/demo/recommendation/data/ml_data.sh
index 408a8723e086d3c374c915623c85bffe0d939151..2268d876389e0bdf5ead405e74d278d276626f82 100755
--- a/demo/recommendation/data/ml_data.sh
+++ b/demo/recommendation/data/ml_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py
index 8dd0cbd32af6074439e98dac024c5fed76cd52b2..be6869c22f04be1db0f8e9c35c73c851e4c490b0 100644
--- a/demo/recommendation/data/split.py
+++ b/demo/recommendation/data/split.py
@@ -1,5 +1,5 @@
 #!/bin/env python2
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py
index ff3932be03f1e4a1fc1d0bdb189ab7fe1fbbeca0..c4ff96d80e81926049c9a71d6d9d991c0b568c25 100755
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse
 
 
+def __list_to_map__(lst):
+    ret_val = dict()
+    for each in lst:
+        k, v = each
+        ret_val[k] = v
+    return ret_val
+
+
 def hook(settings, meta, **kwargs):
     """
     Init hook is invoked before process data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
     #    second part is user features.
     #    final part is rating score.
     # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
-    headers.append(dense_vector(1))  # Score
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
+    settings.movie_names = [h[0] for h in movie_headers]
+    headers = movie_headers
+    user_headers = list(common_utils.meta_to_header(meta, 'user'))
+    settings.user_names = [h[0] for h in user_headers]
+    headers.extend(user_headers)
+    headers.append(("rating", dense_vector(1)))  # Score
 
     # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
     settings.meta = meta
 
 
@@ -57,20 +69,20 @@ def process(settings, filename):
             movie_meta = settings.meta['movie'][movie_id]
             user_meta = settings.meta['user'][user_id]
 
-            outputs = [movie_id - 1]
+            outputs = [('movie_id', movie_id - 1)]
 
             # Then add movie features
-            for each_meta in movie_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(movie_meta):
+                outputs.append((settings.movie_names[i + 1], each_meta))
 
             # Then add user id.
-            outputs.append(user_id - 1)
+            outputs.append(('user_id', user_id - 1))
 
             # Then add user features.
-            for each_meta in user_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(user_meta):
+                outputs.append((settings.user_names[i + 1], each_meta))
 
             # Finally, add score
-            outputs.append([score])
+            outputs.append(('rating', [score]))
             # Return data to paddle
-            yield outputs
+            yield __list_to_map__(outputs)
diff --git a/demo/recommendation/evaluate.sh b/demo/recommendation/evaluate.sh
index 38c1562c6370ddcda60ecdbbc2acf8d32635a4f4..02b2857de028bc9c05d7ddd67012043b671b2764 100755
--- a/demo/recommendation/evaluate.sh
+++ b/demo/recommendation/evaluate.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py
index e2a202cfd1a476046d7e1d1896b87d72c4906ff2..8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5 100755
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
@@ -1,5 +1,5 @@
 #!/bin/env python2
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,8 +34,8 @@ if __name__ == '__main__':
     network.loadParameters(model_path)
     with open('./data/meta.bin', 'rb') as f:
         meta = pickle.load(f)
-        headers = list(meta_to_header(meta, 'movie'))
-        headers.extend(list(meta_to_header(meta, 'user')))
+        headers = [h[1] for h in meta_to_header(meta, 'movie')]
+        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
         cvt = DataProviderConverter(headers)
         while True:
             movie_id = int(raw_input("Input movie_id: "))
diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh
index e181d0be45558955935234c6ccb9eee1b3ce442f..eeb81ce3cb47e65c0aeb303e7571024ba82dad65 100755
--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,15 @@
 # limitations under the License.
 set -e
 
+UNAME_STR=`uname`
+
+if [[ ${UNAME_STR} == 'Linux' ]]; then
+	SHUF_PROG='shuf'
+else
+	SHUF_PROG='gshuf'
+fi
+
+
 cd "$(dirname "$0")"
 delimiter='::'
 dir=ml-1m
@@ -25,7 +34,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
index 846b59cec9fc50bcfc8f76c5e220c32fb2e66ddb..e341d1cc7a3267bef9db916719b2e4b1981e31bc 100755
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py
index cec340b0b65a841029a1c0538d9881bb38f026ff..aabcd335253faf69c940024ac8098a54da030463 100755
--- a/demo/recommendation/trainer_config.py
+++ b/demo/recommendation/trainer_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore
index cd90ca7bbe9be46f54cb656a8067c794a55d8cfc..65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d 100644
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
+data/emb
+data/targetDict.txt
+data/verbDict.txt
+data/wordDict.txt
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index daca5f01cf2b3bd231bf530f17ec760272ce93e0..da44111976a0dec68345fc139d0aa459ca9211c2 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ def extract_dict_features(pair_file, feature_file):
                 ctx_n1 = sentence_list[verb_index - 1]
             else:
                 ctx_n1 = 'bos'
-            
+
             if verb_index > 1:
                 mark[verb_index - 2] = 1
                 ctx_n2 = sentence_list[verb_index - 2]
@@ -43,13 +43,13 @@ def extract_dict_features(pair_file, feature_file):
             mark[verb_index] = 1
             ctx_0 = sentence_list[verb_index]
 
-            if verb_index < len(labels_list) - 2:
+            if verb_index < len(labels_list) - 1:
                 mark[verb_index + 1] = 1
                 ctx_p1 = sentence_list[verb_index + 1]
             else:
                 ctx_p1 = 'eos'
-            
-            if verb_index < len(labels_list) - 3:
+
+            if verb_index < len(labels_list) - 2:
                 mark[verb_index + 2] = 1
                 ctx_p2 = sentence_list[verb_index + 2]
             else:
@@ -69,7 +69,6 @@ def extract_dict_features(pair_file, feature_file):
             feature_out.write(feature_str + '\n')
 
 
-
 if __name__ == '__main__':
 
     usage = '-p pair_file -f feature_file'
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 86ab00ce41723169de035a841d9e129a1b9e82a3..94a8488c16734eb1882d54f7ec36f4b9308c09d4 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -66,8 +66,8 @@ def transform_labels(sentences, labels):
         else:
             verb_list = []
             for x in labels[i][0]:
-                if x !='-':
-                   verb_list.append(x)
+                if x != '-':
+                    verb_list.append(x)
 
             for j in xrange(1, len(labels[i])):
                 label_list = labels[i][j]
@@ -93,7 +93,7 @@ def transform_labels(sentences, labels):
                         is_in_bracket = True
                     else:
                         print 'error:', ll
-                sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
+                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
     return sen_lab_pair
 
 
@@ -103,7 +103,7 @@ def write_file(sen_lab_pair, output_file):
             sentence = x[0]
             label_seq = ' '.join(x[2])
             assert len(sentence.split()) == len(x[2])
-            fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
+            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
 
 
 if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
old mode 100644
new mode 100755
index 55e33f4685627ed483aa6642c518a33558091531..a0ef26a13b9a03392cb8b6207d6d21b7761e38e8
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@
 # limitations under the License.
 set -e
 wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
 tar -xzvf conll05st-tests.tar.gz
 rm conll05st-tests.tar.gz
 cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index d4c137ef42c4e2ec609f3e6f809363e602dfd8dd..042cd4e7a9e256cd597ac34eed423040f1d7ccd5 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,32 +21,36 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
     settings.word_dict = word_dict
     settings.label_dict = label_dict
     settings.predicate_dict = predicate_dict
-   
+
     #all inputs are integral and sequential type
     settings.slots = [
         integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)), integer_value_sequence(2),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
         integer_value_sequence(len(label_dict))
     ]
 
 
 def get_batch_size(yeild_data):
     return len(yeild_data[0])
-    
 
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, 
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+
+@provider(
+    init_hook=hook,
+    should_shuffle=True,
+    calc_batch_size=get_batch_size,
+    can_over_batch_size=False,
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_name):
     with open(file_name, 'r') as fdata:
         for line in fdata:
             sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
                 line.strip().split('\t')
-           
+
             words = sentence.split()
             sen_len = len(words)
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
@@ -63,5 +67,5 @@ def process(settings, file_name):
 
             label_list = label.split()
             label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+            yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index 54ceff0e724220cc9ea96b9e0ec6844947a8343e..04e2a559b19bd4b9aec0242eb43edf6ab1e7624e 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ from paddle.trainer_config_helpers import *
 #file paths
 word_dict_file = './data/wordDict.txt'
 label_dict_file = './data/targetDict.txt'
-predicate_file= './data/verbDict.txt'
+predicate_file = './data/verbDict.txt'
 train_list_file = './data/train.list'
 test_list_file = './data/test.list'
 
@@ -47,7 +47,6 @@ if not is_predict:
             w = line.strip()
             predicate_dict[w] = i
 
-
     if is_test:
         train_list_file = None
 
@@ -57,9 +56,11 @@ if not is_predict:
         test_list=test_list_file,
         module='dataprovider',
         obj='process',
-        args={'word_dict': word_dict,
-              'label_dict': label_dict,
-              'predicate_dict': predicate_dict })
+        args={
+            'word_dict': word_dict,
+            'label_dict': label_dict,
+            'predicate_dict': predicate_dict
+        })
 
     word_dict_len = len(word_dict)
     label_dict_len = len(label_dict)
@@ -77,24 +78,16 @@ mark_dim = 5
 hidden_dim = 512
 depth = 8
 
-
-
 ########################### Optimizer #######################################
 
-
 settings(
     batch_size=150,
     learning_method=MomentumOptimizer(momentum=0),
     learning_rate=2e-2,
     regularization=L2Regularization(8e-4),
     is_async=False,
-    model_average=ModelAverage(average_window=0.5,
-                               max_average_window=10000),
-                               
-)
-
-
-
+    model_average=ModelAverage(
+        average_window=0.5, max_average_window=10000), )
 
 ####################################### network ##############################
 #8 features and 1 target
@@ -108,22 +101,28 @@ ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
 ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
 mark = data_layer(name='mark_data', size=mark_dict_len)
 
-
 if not is_predict:
     target = data_layer(name='target', size=label_dict_len)
 
-
-default_std=1/math.sqrt(hidden_dim)/3.0
+default_std = 1 / math.sqrt(hidden_dim) / 3.0
 
 emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
 std_0 = ParameterAttribute(initial_std=0.)
-std_default = ParameterAttribute(initial_std=default_std) 
-
-predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
-mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
-
-word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(
+    size=word_dim,
+    input=predicate,
+    param_attr=ParameterAttribute(
+        name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [
+    embedding_layer(
+        size=word_dim, input=x, param_attr=emb_para) for x in word_input
+]
 emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 
@@ -131,84 +130,89 @@ hidden_0 = mixed_layer(
     name='hidden0',
     size=hidden_dim,
     bias_attr=std_default,
-    input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
-
+    input=[
+        full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])
 
 mix_hidden_lr = 1e-3
 lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
-
-lstm_0 = lstmemory(name='lstm0',
-                   input=hidden_0, 
-                   act=ReluActivation(),
-                   gate_act=SigmoidActivation(),
-                   state_act=SigmoidActivation(),
-                   bias_attr=std_0,
-                   param_attr=lstm_para_attr)
+hidden_para_attr = ParameterAttribute(
+    initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(
+    name='lstm0',
+    input=hidden_0,
+    act=ReluActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=SigmoidActivation(),
+    bias_attr=std_0,
+    param_attr=lstm_para_attr)
 
 #stack L-LSTM and R-LSTM with direct edges
 input_tmp = [hidden_0, lstm_0]
 
-
 for i in range(1, depth):
 
-    mix_hidden = mixed_layer(name='hidden'+str(i),
-                             size=hidden_dim, 
-                             bias_attr=std_default,
-                             input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                    full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                   ]
-                             )
-
-    lstm = lstmemory(name='lstm'+str(i),
-                     input=mix_hidden,
-                     act=ReluActivation(),
-                     gate_act=SigmoidActivation(),
-                     state_act=SigmoidActivation(),
-                     reverse=((i % 2)==1),
-                     bias_attr=std_0,
-                     param_attr=lstm_para_attr)
+    mix_hidden = mixed_layer(
+        name='hidden' + str(i),
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ])
+
+    lstm = lstmemory(
+        name='lstm' + str(i),
+        input=mix_hidden,
+        act=ReluActivation(),
+        gate_act=SigmoidActivation(),
+        state_act=SigmoidActivation(),
+        reverse=((i % 2) == 1),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)
 
     input_tmp = [mix_hidden, lstm]
 
-feature_out = mixed_layer(name='output',
-                          size=label_dict_len,
-                          bias_attr=std_default, 
-                          input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                 full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                ],
-                          )
-
-
+feature_out = mixed_layer(
+    name='output',
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )
 
 if not is_predict:
-    crf_l = crf_layer( name = 'crf',
-                       size = label_dict_len,
-                       input = feature_out, 
-                       label = target,
-                       param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
-
-                      )
-
-    
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   label = target,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
-
+    crf_l = crf_layer(
+        name='crf',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(
+            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
+
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(name='crfw'))
 
     eval = sum_evaluator(input=crf_dec_l)
-        
+
     outputs(crf_l)
 
 else:
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        param_attr=ParameterAttribute(name='crfw'))
 
     outputs(crf_dec_l)
-
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index 2761814e1811e701122e0be4850526c5b290c457..372fd090b6e8f08f5bb34697772c2e4976810595 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,7 +26,8 @@ UNK_IDX = 0
 
 
 class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
+    def __init__(self, train_conf, dict_file, model_dir, label_file,
+                 predicate_dict_file):
         """
         train_conf: trainer configure.
         dict_file: word dictionary file name.
@@ -35,7 +36,7 @@ class Prediction():
 
         self.dict = {}
         self.labels = {}
-        self.predicate_dict={}
+        self.predicate_dict = {}
         self.labels_reverse = {}
         self.load_dict_label(dict_file, label_file, predicate_dict_file)
 
@@ -44,28 +45,17 @@ class Prediction():
         len_pred = len(self.predicate_dict)
 
         conf = parse_config(
-            train_conf,
-            'dict_len=' + str(len_dict) + 
-            ',label_len=' + str(len_label) +
-            ',pred_len=' + str(len_pred) +
-            ',is_predict=True')
+            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
+            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
         self.network = swig_paddle.GradientMachine.createFromConfigProto(
             conf.model_config)
         self.network.loadParameters(model_dir)
 
         slots = [
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_pred),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), 
-            integer_value_sequence(2)
-            ]
             integer_value_sequence(len_dict), integer_value_sequence(len_dict),
             integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(2)
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_pred), integer_value_sequence(2)
         ]
         self.converter = DataProviderConverter(slots)
 
@@ -82,6 +72,7 @@ class Prediction():
 
         for line_count, line in enumerate(open(predicate_dict_file, 'r')):
             self.predicate_dict[line.strip()] = line_count
+
     def get_data(self, data_file):
         """
         Get input data of paddle format.
@@ -92,9 +83,10 @@ class Prediction():
                 ).split('\t')
                 words = sentence.split()
                 sen_len = len(words)
-                 
+
                 word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
+                                  ] * sen_len
                 ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
                 ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
                 ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
@@ -103,9 +95,9 @@ class Prediction():
 
                 marks = mark.split()
                 mark_slot = [int(w) for w in marks]
-                
-                yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot,  mark_slot
+
+                yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
 
     def predict(self, data_file, output_file):
         """
@@ -127,8 +119,9 @@ class Prediction():
 
 
 def option_parser():
-    usage = ("python predict.py -c config -w model_dir " 
-             "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
+    usage = (
+        "python predict.py -c config -w model_dir "
+        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
     parser = OptionParser(usage="usage: %s [options]" % usage)
     parser.add_option(
         "-c",
@@ -191,8 +184,9 @@ def main():
     output_file = options.output_file
 
     swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
-    predict.predict(data_file,output_file)
+    predict = Prediction(train_conf, dict_file, model_path, label_file,
+                         predict_dict_file)
+    predict.predict(data_file, output_file)
 
 
 if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
old mode 100644
new mode 100755
index d0acdb0bd093974485475cf796c6d41ac7899135..873aad670d16803ce321ab60baabe9fe29ea64bf
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ set -e
 function get_best_pass() {
   cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
   sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
+  sort -n | head -n 1
 }   
 
 log=train.log
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
old mode 100644
new mode 100755
index c4ab44f5ca08aefd18f2851a1410aa08563925a9..11d9d6a19c1b17ad1b7540ee7a03017f85dd821e
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ set -e
 function get_best_pass() {
   cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
   sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort | head -n 1
+  sort -n | head -n 1
 }
 
 log=train.log
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
old mode 100644
new mode 100755
index 420768bb2b4ebed7b135a49c5eee5e5538426ae1..9354e72f46dc4dfc46138a04c330933d404c6cb8
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh
index 28fa86232d89964b3f1680080239cf8a4ebefa9a..7600af6fbb900ee845702f1297779c1f0ed9bf84 100755
--- a/demo/sentiment/data/get_imdb.sh
+++ b/demo/sentiment/data/get_imdb.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py
index 53e3d1d20df92b8815347bd8937064871f326b3f..00f72cecacb454a0dd1184fa2098be4543007de7 100755
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index bc0f6f31264294034ed38309f7fda370865b2845..8ec490f64691924013200a3d0038d39aa834b038 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
+import os, sys
 import numpy as np
 from optparse import OptionParser
 from py_paddle import swig_paddle, DataProviderConverter
@@ -66,34 +66,24 @@ class SentimentPrediction():
         for v in open(label_file, 'r'):
             self.label[int(v.split('\t')[1])] = v.split('\t')[0]
 
-    def get_data(self, data_file):
+    def get_index(self, data):
         """
-        Get input data of paddle format.
+        transform word into integer index according to the dictionary.
         """
-        with open(data_file, 'r') as fdata:
-            for line in fdata:
-                words = line.strip().split()
-                word_slot = [
-                    self.word_dict[w] for w in words if w in self.word_dict
-                ]
-                if not word_slot:
-                    print "all words are not in dictionary: %s", line
-                    continue
-                yield [word_slot]
-
-    def predict(self, data_file):
-        """
-        data_file: file name of input data.
-        """
-        input = self.converter(self.get_data(data_file))
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
         output = self.network.forwardTest(input)
         prob = output[0]["value"]
-        lab = np.argsort(-prob)
-        if self.label is None:
-            print("%s: predicting label is %d" % (data_file, lab[0][0]))
-        else:
-            print("%s: predicting label is %s" %
-                  (data_file, self.label[lab[0][0]]))
+        labs = np.argsort(-prob)
+        for idx, lab in enumerate(labs):
+            if self.label is None:
+                print("predicting label is %d" % (lab[0]))
+            else:
+                print("predicting label is %s" % (self.label[lab[0]]))
 
 
 def option_parser():
@@ -119,11 +109,13 @@ def option_parser():
         default=None,
         help="dictionary file")
     parser.add_option(
-        "-i",
-        "--data",
+        "-c",
+        "--batch_size",
+        type="int",
         action="store",
-        dest="data",
-        help="data file to predict")
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
     parser.add_option(
         "-w",
         "--model",
@@ -137,13 +129,21 @@ def option_parser():
 def main():
     options, args = option_parser()
     train_conf = options.train_conf
-    data = options.data
+    batch_size = options.batch_size
     dict_file = options.dict_file
     model_path = options.model_path
     label = options.label
     swig_paddle.initPaddle("--use_gpu=0")
     predict = SentimentPrediction(train_conf, dict_file, model_path, label)
-    predict.predict(data)
+
+    batch = []
+    for line in sys.stdin:
+        batch.append([predict.get_index(line)])
+        if len(batch) == batch_size:
+            predict.batch_predict(batch)
+            batch = []
+    if len(batch) > 0:
+        predict.batch_predict(batch)
 
 
 if __name__ == '__main__':
diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh
index 053f23e491ab8d8af082e0b1ff1093be714a8268..c72a8e8641516543ef267fcb4b448630246d1e8d 100755
--- a/demo/sentiment/predict.sh
+++ b/demo/sentiment/predict.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,9 +19,9 @@ set -e
 model=model_output/pass-00002/
 config=trainer_config.py
 label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d ./data/pre-imdb/dict.txt \
-     -i ./data/aclImdb/test/pos/10007_10.txt 
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py
index 7146e95d751c4de649e204fab724085994dfa4d3..29b3682b747c66574590de5ea70574981cc536bb 100755
--- a/demo/sentiment/preprocess.py
+++ b/demo/sentiment/preprocess.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/preprocess.sh b/demo/sentiment/preprocess.sh
index 5f5c78e222917ddd0f3b4b3387a5753288c0b3a8..19ec34d4f016365d18db01ddec559d26202b19c6 100755
--- a/demo/sentiment/preprocess.sh
+++ b/demo/sentiment/preprocess.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py
index ff6a3624a404cb52d5d7ac0934fedba0d489dc22..a01577ca5ae025b7bec67c6d54c7dbd931dbee74 100644
--- a/demo/sentiment/sentiment_net.py
+++ b/demo/sentiment/sentiment_net.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 098fbb91389b89c8b69ccf2f5d308e4e715ac950..8af827c3388c8df88a872bd87d121a4f9631c3ff 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ set -e
 function get_best_pass() {
   cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
   sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort | head -n 1
+  sort -n | head -n 1
 }
 
 log=train.log
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
index f44a9a53f2db9a57a1f018b090a3cd74dae70e72..5ce8bf4b997d962b9b61593cec0954d76c4874bc 100755
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 114a9138ebfef054c7d3ba99b4a510a452f8f2cd..2defecd178262900c03c1eda60b351dc44629d1f 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
index ea1f8dbcfad35699189f6cd4efc81d97e8c89148..e6497c91286d44b5ef3b66c5f824e36a09728720 100755
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +16,7 @@ set -e
 set -x
 
 # download the in-house paraphrase dataset
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
 
 # untar the dataset
 tar -zxvf paraphrase.tar.gz
diff --git a/demo/seqToseq/data/paraphrase_model.sh b/demo/seqToseq/data/paraphrase_model.sh
index 041f69cf467b1322f9d261a9b6b533d98320957d..d0e7f214a38c4dad0fdf7c10ba3b76eb0ab40f06 100755
--- a/demo/seqToseq/data/paraphrase_model.sh
+++ b/demo/seqToseq/data/paraphrase_model.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/data/wmt14_data.sh b/demo/seqToseq/data/wmt14_data.sh
index 6c360b206011a7ad2eeab6bde7aef8ce35ec01e7..43f67168d2a876ba5401e0f8490a88adac9c5551 100755
--- a/demo/seqToseq/data/wmt14_data.sh
+++ b/demo/seqToseq/data/wmt14_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh
index 2cec30688d27a57902cdf64d7be5712d12c69bdd..c4b55b90a3eb98f94e0eb3be028c6de1ef57326b 100755
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +16,7 @@ set -e
 set -x
 
 # download the pretrained model
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
 
 # untar the model
 tar -zxvf wmt14_model.tar.gz
diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py
index c5da1b7685f47fda337921c7c60ac1497b9e48bb..c2b49804be582d7d0bc3ef6332741be03936eb24 100755
--- a/demo/seqToseq/dataprovider.py
+++ b/demo/seqToseq/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,27 +19,43 @@ START = "<s>"
 END = "<e>"
 
 
-def hook(settings, src_dict, trg_dict, file_list, **kwargs):
+def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
+         **kwargs):
     # job_mode = 1: training mode
     # job_mode = 0: generating mode
-    settings.job_mode = trg_dict is not None
-    settings.src_dict = src_dict
+    settings.job_mode = not is_generating
+
+    def fun(dict_path):
+        out_dict = dict()
+        with open(dict_path, "r") as fin:
+            out_dict = {
+                line.strip(): line_count
+                for line_count, line in enumerate(fin)
+            }
+        return out_dict
+
+    settings.src_dict = fun(src_dict_path)
+    settings.trg_dict = fun(trg_dict_path)
+
     settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
-    settings.sample_count = 0
 
     if settings.job_mode:
-        settings.trg_dict = trg_dict
-        settings.slots = [
+        settings.slots = {
+            'source_language_word':
             integer_value_sequence(len(settings.src_dict)),
+            'target_language_word':
             integer_value_sequence(len(settings.trg_dict)),
+            'target_language_next_word':
             integer_value_sequence(len(settings.trg_dict))
-        ]
+        }
         settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
     else:
-        settings.slots = [
+        settings.slots = {
+            'source_language_word':
             integer_value_sequence(len(settings.src_dict)),
+            'sent_id':
             integer_value_sequence(len(open(file_list[0], "r").readlines()))
-        ]
+        }
 
 
 def _get_ids(s, dictionary):
@@ -69,6 +85,10 @@ def process(settings, file_name):
                     continue
                 trg_ids_next = trg_ids + [settings.trg_dict[END]]
                 trg_ids = [settings.trg_dict[START]] + trg_ids
-                yield src_ids, trg_ids, trg_ids_next
+                yield {
+                    'source_language_word': src_ids,
+                    'target_language_word': trg_ids,
+                    'target_language_next_word': trg_ids_next
+                }
             else:
-                yield src_ids, [line_count]
+                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
diff --git a/demo/seqToseq/paraphrase/train.conf b/demo/seqToseq/paraphrase/train.conf
index 748920e2c7253790f1f0b9635e7e604f598a0da3..be79c5e771c0e864fd1776cedb3ef37c997b6df6 100644
--- a/demo/seqToseq/paraphrase/train.conf
+++ b/demo/seqToseq/paraphrase/train.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
index 2aa7b84060b19869f73f9a64df125bd429395947..33a42f6eff2b0414c466d5f78c89989a6a517eb9 100755
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py
index bd1c51b1514b790ec385d48f49197b3e0285e736..03f371331a0755e5939e457f4bdfb1770b8dad88 100755
--- a/demo/seqToseq/preprocess.py
+++ b/demo/seqToseq/preprocess.py
@@ -1,5 +1,5 @@
 #!/bin/env python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index ad5e3339c1461de06732eb62aca9e8323eea707b..e523a34d5a95120d1f0a583be8bbdbff5678d1ab 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -1,6 +1,6 @@
 # edit-mode: -*- python -*-
 
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
     """
     src_lang_dict = os.path.join(data_dir, 'src.dict')
     trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-    src_dict = dict()
-    for line_count, line in enumerate(open(src_lang_dict, "r")):
-        src_dict[line.strip()] = line_count
-    trg_dict = dict()
-    for line_count, line in enumerate(open(trg_lang_dict, "r")):
-        trg_dict[line.strip()] = line_count
 
     if is_generating:
         train_list = None
         test_list = os.path.join(data_dir, gen_list)
-        trg_dict = None
     else:
         train_list = os.path.join(data_dir, train_list)
         test_list = os.path.join(data_dir, test_list)
@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
         test_list,
         module="dataprovider",
         obj="process",
-        args={"src_dict": src_dict,
-              "trg_dict": trg_dict})
+        args={
+            "src_dict_path": src_lang_dict,
+            "trg_dict_path": trg_lang_dict,
+            "is_generating": is_generating
+        })
 
     return {
         "src_dict_path": src_lang_dict,
diff --git a/demo/seqToseq/translation/eval_bleu.sh b/demo/seqToseq/translation/eval_bleu.sh
index ef0ede717a740f4dcbf38fe6bde504470779e81a..54c2ed237e93adb3456dbe62f75626d36c2d90bc 100755
--- a/demo/seqToseq/translation/eval_bleu.sh
+++ b/demo/seqToseq/translation/eval_bleu.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/translation/gen.conf b/demo/seqToseq/translation/gen.conf
index 63c5c2f9a6052c16211c80284617ab1dad0133d7..e9bea4e4559ff31ad83c4474e91de7e7acc77e9f 100644
--- a/demo/seqToseq/translation/gen.conf
+++ b/demo/seqToseq/translation/gen.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
index ad977c05ff98975c2ce667372e7b491af9243ecc..a700ae213473dfe7c5b77156de15775b8fe9a9f0 100755
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/translation/moses_bleu.sh b/demo/seqToseq/translation/moses_bleu.sh
index bfaba40b26905ce04934253ff8a0e7738018ecc2..2f230d7f4c736da003966fbdb277f6b8b1ec952c 100755
--- a/demo/seqToseq/translation/moses_bleu.sh
+++ b/demo/seqToseq/translation/moses_bleu.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf
index cf1bde15c4a8aa2bf853f0006c3addcf71cf2e55..72b7ccdbb95dbda8f06674079db9a3257bb31622 100644
--- a/demo/seqToseq/translation/train.conf
+++ b/demo/seqToseq/translation/train.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
index 976b5ba3b054c49bd87fda87d2520bd6cc55116c..bdece693e5c407c89bc172c461bac7f9b20560d3 100755
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh
index e579d6c46ce5ed96f442acc448b4cc61bf8394a3..0cdb394035e782b3a647f7f13e79d55b5d3dff48 100755
--- a/demo/sequence_tagging/data/get_data.sh
+++ b/demo/sequence_tagging/data/get_data.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py
index 37dcb7aa17c0abd197ef2f3121bf8be6c54375c2..bb4b4465bc7e032c50c1d21263651e2578af67be 100644
--- a/demo/sequence_tagging/dataprovider.py
+++ b/demo/sequence_tagging/dataprovider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
index 64895742e1b8c0a11cbedee0b88e61b5b63b007f..0624b17787aaf90732707e5f4fc6c2195b8f65ee 100644
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -74,7 +74,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
index 90d4bbdddfdb4e38b930d54a2bc865df9fac589c..b9b41b2433461eb1bfb309659834661c2ae43253 100644
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -112,7 +112,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index ef4e9d102d35fc95e96711175a57f7e181a946c6..6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -7,40 +7,50 @@ if(NOT DEFINED SPHINX_THEME_DIR)
 endif()
 
 # configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
 
 # Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-
-
-set(PADDLE_DOXYGEN_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/doxygen_xml")
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in"
-    "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
-    @ONLY
-  )
+sphinx_add_target(paddle_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
 
-add_custom_target(paddle_doxygen_docs ALL
-    ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-)
+add_dependencies(paddle_docs
+  gen_proto_py)
 
-sphinx_add_target(paddle_docs
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_docs_cn
                   html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
+                  ${SPHINX_HTML_DIR_CN})
 
-add_dependencies(paddle_docs 
-  gen_proto_py
-  paddle_doxygen_docs)
+add_dependencies(paddle_docs_cn
+  gen_proto_py)
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
deleted file mode 100644
index a1fc3801925dd340709ac77c9aa77c82051ee111..0000000000000000000000000000000000000000
--- a/doc/Doxyfile.in
+++ /dev/null
@@ -1,2384 +0,0 @@
-# Doxyfile 1.8.10
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "paddle"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         = 1.0.0
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          =
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           =
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = @PADDLE_DOXYGEN_OUTPUT@
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 2
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = YES
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = NO
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = NO
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = @PROJ_ROOT@/paddle
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
-
-FILE_PATTERNS          = *.c *.cc *.cpp *.cu *.h *.hpp *.cuh *.ph
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                = 
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       = */x86_64-scm-linux-gnu/* */internals/* */mkl/* */test/* */tests/* */platform/*
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = NO
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file that includes any scripts and style sheets
-# that doxygen needs, which is dependent on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify :
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             =
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = YES
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           =
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      =
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3bf030004d4de8c6f3cb773c6e78c09f40878c5f
--- /dev/null
+++ b/doc/about/index_cn.md
@@ -0,0 +1,11 @@
+关于PaddlePaddle
+================
+
+PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
+PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
+同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
+
+致谢
+--------
+
+在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..065c430cdea802ed3c9f487cd00255b85a5598a5
--- /dev/null
+++ b/doc/about/index_en.rst
@@ -0,0 +1,14 @@
+ABOUT
+=======
+
+PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
+which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
+
+PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
+We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
+
+
+Credits
+--------
+
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
diff --git a/doc/algorithm/index.rst b/doc/algorithm/index.rst
deleted file mode 100644
index 6073add3c0cbb12529eabb0f8d8a051bcb84e628..0000000000000000000000000000000000000000
--- a/doc/algorithm/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Algorithm Tutorial
-==================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn.rst
diff --git a/doc/algorithm/rnn/bi_lstm.jpg b/doc/algorithm/rnn/bi_lstm.jpg
deleted file mode 120000
index a53296cf806f97f7f2520e1700c4fb93f6bfc9d8..0000000000000000000000000000000000000000
--- a/doc/algorithm/rnn/bi_lstm.jpg
+++ /dev/null
@@ -1 +0,0 @@
-../../demo/sentiment_analysis/bi_lstm.jpg
\ No newline at end of file
diff --git a/doc/algorithm/rnn/encoder-decoder-attention-model.png b/doc/algorithm/rnn/encoder-decoder-attention-model.png
deleted file mode 120000
index db71321a43a37b774e7de0af3765a60345033743..0000000000000000000000000000000000000000
--- a/doc/algorithm/rnn/encoder-decoder-attention-model.png
+++ /dev/null
@@ -1 +0,0 @@
-../../demo/text_generation/encoder-decoder-attention-model.png
\ No newline at end of file
diff --git a/doc/api/data_provider/dataprovider_cn.rst b/doc/api/data_provider/dataprovider_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6861ecece8cad19aa8a1e4e67e819f40873ef07c
--- /dev/null
+++ b/doc/api/data_provider/dataprovider_cn.rst
@@ -0,0 +1,13 @@
+DataProvider的介绍
+==================
+
+DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
+
+PaddlePaddle需要用户在网络配置（trainer_config.py）中定义使用哪种DataProvider，并且在DataProvider中实现如何访问训练文件列表（train.list）或测试文件列表（test.list）。
+
+- train.list和test.list存放在本地（推荐直接存放到训练目录，以相对路径引用)。一般情况下，两者均为纯文本文件，其中每一行对应一个数据文件地址：
+  
+  - 如果数据文件存于本地磁盘，这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
+  - 地址也可以为hdfs文件路径，或者数据库连接路径等。
+  - 由于这个地址会被DataProvider使用，因此，如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
+- 如果没有设置test.list，或设置为None，那么在训练过程中不会执行测试操作；否则，会根据命令行参数指定的测试方式，在训练过程中进行测试，从而防止过拟合。
diff --git a/doc/ui/data_provider/index.rst b/doc/api/data_provider/dataprovider_en.rst
similarity index 87%
rename from doc/ui/data_provider/index.rst
rename to doc/api/data_provider/dataprovider_en.rst
index 3db5b57376257b83fc2a27c518b0db663682136d..96efbb1da959daec561009fdcc95d353b191dec8 100644
--- a/doc/ui/data_provider/index.rst
+++ b/doc/api/data_provider/dataprovider_en.rst
@@ -1,5 +1,5 @@
-DataProvider Introduction
-=========================
+Introduction
+==============
 DataProvider is a module that loads training or testing data into cpu or gpu
 memory for the following triaining or testing process.
 
@@ -32,11 +32,3 @@ Each line of train.list and test.list is an absolute or relative path (relative
 to the PaddePaddle program runtime) of data file. Fascinatingly more, each line
 can also be a HDFS file path or a SQL connection string. As long as the user
 assures how to access each file in DataProvider.
-
-Please refer to the following articles for more information about the detail
-usages of DataProvider and how to implement a new DataProvider,
-
-..  toctree::
-
-    pydataprovider2.rst
-    write_new_dataprovider.rst
diff --git a/doc/api/data_provider/pydataprovider2_cn.rst b/doc/api/data_provider/pydataprovider2_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f243ea775a6b4c0961a8948653ad54ea9b531dcb
--- /dev/null
+++ b/doc/api/data_provider/pydataprovider2_cn.rst
@@ -0,0 +1,227 @@
+PyDataProvider2的使用
+=====================
+
+PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据，并提供了简单的Cache功能；同时可以使用户只关注如何从文件中读取每一条数据，而不用关心数据如何传输，如何存储等等。
+
+..  contents::
+
+MNIST的使用场景
+---------------
+
+我们以MNIST手写识别为例，来说明PyDataProvider2的简单使用场景。
+
+样例数据
+++++++++
+
+MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下：
+
+..  literalinclude:: src/mnist_train.txt
+
+其中每行数据代表一张图片，行内使用 ``;`` 分成两部分。第一部分是图片的标签，为0-9中的一个数字；第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字：
+
+..  literalinclude:: src/train.list
+
+dataprovider的使用
+++++++++++++++++++
+
+..  literalinclude:: src/mnist_provider.dict.py
+
+- 首先，引入PaddlePaddle的PyDataProvider2包。
+- 其次，定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2，同时设置它的input_types属性。
+  
+  - `input_types`_：设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字，显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
+
+    ..  literalinclude:: src/mnist_config.py
+         :lines: 9-10
+
+  - 注意：如果用户不显示指定返回数据的对应关系，那么PaddlePaddle会根据layer的声明顺序，来确定对应关系。但这个关系可能不正确，所以推荐使用显式指定的方式来设置input_types。
+- 最后，实现数据输入函数（如本例的 ``process`` 函数）。
+
+  - 该函数的功能是：打开文本文件，读取每一行，将行中的数据转换成与input_types一致的格式，然后返回给PaddlePaddle进程。注意，
+    
+    - 返回的顺序需要和input_types中定义的顺序一致。
+    - 返回时，必须使用Python关键词 ``yield`` ，相关概念是 ``generator`` 。
+    - 一次yield调用，返回一条完整的样本。如果想为一个数据文件返回多条样本，只需要在函数中调用多次yield即可（本例中使用for循环进行多次调用）。
+  
+  - 该函数具有两个参数：
+  
+    - settings：在本例中没有使用，具体可以参考 `init_hook`_ 中的说明。
+    - filename：为 ``train.list`` 或 ``test.list`` 中的一行，即若干数据文件路径的某一个。
+
+网络配置中的调用
+++++++++++++++++
+
+在网络配置里，只需要一行代码就可以调用这个PyDataProvider2，如，
+
+..  literalinclude:: src/mnist_config.py
+     :lines: 1-7
+
+训练数据是 ``train.list`` ，没有测试数据，调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
+
+小结
++++++
+
+至此，简单的PyDataProvider2样例就说明完毕了。对用户来说，仅需要知道如何从 **一个文件** 中读取 **一条样本** ，就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作：
+
+* 将数据组合成Batch进行训练
+* 对训练数据进行Shuffle
+* 多线程的数据读取
+* 缓存训练数据到内存(可选)
+* CPU->GPU双缓存
+
+是不是很简单呢？
+
+时序模型的使用场景
+------------------
+样例数据
+++++++++
+
+时序模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列数据。
+
+本例采用英文情感分类的数据，即将一段英文文本数据，分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下：
+
+..  literalinclude:: src/sentimental_train.txt
+
+dataprovider的使用
+++++++++++++++++++
+
+相对MNIST而言，这个dataprovider较复杂，主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的，它会在dataprovider创建的时候执行。
+
+- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列，因此使用 ``integer_value_sequence`` 类型来设置。
+- 将 ``dictionary`` 存入settings对象，在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象，即一个将单词字符串映射到单词ID的字典。
+
+..  literalinclude:: src/sentimental_provider.py
+
+网络配置中的调用
+++++++++++++++++
+
+调用这个PyDataProvider2的方法，基本上和MNIST样例一致，除了
+
+* 在配置中需要读取外部字典。
+* 在声明DataProvider的时候传入dictionary作为参数。
+
+..  literalinclude:: src/sentimental_config.py
+     :emphasize-lines: 12-14
+
+参考(Reference)
+---------------
+
+@provider
++++++++++
+
+``@provider`` 是一个Python的 `Decorator`_ ，可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系，只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
+
+*  input_types：数据输入格式。具体的格式说明，请参考 `input_types`_ 。
+*  should_shuffle：是不是要对数据做Shuffle。训练时默认shuffle，测试时默认不shuffle。
+*  min_pool_size：设置内存中最小暂存的数据条数，也是PaddlePaddle所能够保证的shuffle粒度。如果为-1，则会预先读取全部数据到内存中。
+*  pool_size： 设置内存中暂存的数据条数。如果为-1（默认），则不在乎内存暂存多少条数据。如果设置，则推荐大于训练时batch size的值，并且在内存足够的情况下越大越好。
+*  can_over_batch_size：是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题，一般推荐设置成True。
+*  calc_batch_size：可以传入一个函数，用于自定义每条数据的batch size（默认为1）。
+*  cache： 数据缓存的策略，具体请参考 `cache`_ 。
+*  init_hook：初始化时调用的函数，具体请参考 `init_hook`_ 。
+*  check：如果为true，会根据input_types检查数据的合法性。
+*  check_fail_continue：如果为true，那么当check出数据不合法时，会扔到这条数据，继续训练或预测。（对check=false的情况，没有作用）
+
+input_types
++++++++++++
+
+PaddlePaddle的数据包括四种主要类型，和三种序列模式。
+
+四种数据类型：
+
+* dense_vector：稠密的浮点数向量。
+* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
+* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
+* integer：整数标签。
+
+三种序列模式：
+
+* SequenceType.NO_SEQUENCE：不是一条序列
+* SequenceType.SEQUENCE：是一条时间序列
+* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
+
+不同的数据类型和序列模式返回的格式不同，列表如下：
+
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
++======================+=====================+===================================+================================================+
+| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+
+其中，f代表一个浮点数，i代表一个整数。
+
+注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
+
+- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
+- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
+
+init_hook
++++++++++
+
+init_hook可以传入一个函数。该函数在初始化的时候会被调用，其参数如下:
+
+* 第一个参数是settings对象，它和数据传入函数的第一个参数（如本例中 ``process`` 函数的 ``settings`` 参数）必须一致。该对象具有以下两个属性：
+    * settings.input_types：数据输入格式，具体请参考 `input_types`_ 。
+    * settings.logger：一个logging对象。
+* 其他参数使用 ``kwargs`` （key word arguments）传入，包括以下两种：
+    * PaddlePaddle定义的参数: 1）is_train：bool型参数，表示用于训练或预测；2）file_list：所有文件列表。
+    * 用户定义的参数：使用args在网络配置中设置。
+
+注意：PaddlePaddle保留添加参数的权力，因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
+
+cache
++++++
+
+PyDataProvider2提供了两种简单的Cache策略：
+
+* CacheType.NO_CACHE：不缓存任何数据，每次都会从python端读取数据
+* CacheType.CACHE_PASS_IN_MEM：第一个pass会从python端读取数据，剩下的pass会直接从内存里
+  读取数据。 
+
+
+注意事项
+--------
+
+可能的内存泄露问题
+++++++++++++++++++
+
+PaddlePaddle将train.list中的每一行都传递给process函数，从而生成多个generator。当训练数据非常多时，就会生成非常多的generator。
+
+虽然每个generator在没有调用的时候，是几乎不占内存的；但当调用过一次后，generator便会存下当前的上下文(Context)，而这个Context可能会非常大。并且，generator至少需要调用两次才会知道是否停止。所以，即使process函数里面只有一个yield，也需要两次随机选择到相同generator的时候，才会释放该段内存。
+
+..  code-block:: python
+
+    def func():
+        yield 0
+
+    f = func()  # 创建generator
+    tmp = next(f)  # 调用一次，返回0
+    tmp = next(f)  # 调用第二次的时候，才会Stop Iteration
+
+由于顺序调用这些generator不会出现上述问题，因此有两种解决方案：
+
+1. **最佳推荐**：将样本的地址放入另一个文本文件，train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
+2. 在generator的上下文中尽量留下非常少的变量引用，例如
+
+..  code-block:: python
+
+    def real_process(fn):
+        # ... read from fn
+        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
+
+    def process(fn):
+        yield real_process(fn)
+
+注意：这个问题是PyDataProvider读数据时候的逻辑问题，很难整体修正。
+
+内存不够用的情况
+++++++++++++++++
+
+PyDataProvider2会尽可能多的使用内存。因此，对于内存较小的机器，推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
+
diff --git a/doc/ui/data_provider/pydataprovider2.rst b/doc/api/data_provider/pydataprovider2_en.rst
similarity index 94%
rename from doc/ui/data_provider/pydataprovider2.rst
rename to doc/api/data_provider/pydataprovider2_en.rst
index e105d3be308705d228c0b188e15742a0f7325ab6..30357be32538db4423ad0eaf899138256c84edc7 100644
--- a/doc/ui/data_provider/pydataprovider2.rst
+++ b/doc/api/data_provider/pydataprovider2_en.rst
@@ -1,5 +1,7 @@
-How to use PyDataProvider2
-==========================
+..  _api_pydataprovider2:
+
+PyDataProvider2
+===============
 
 We highly recommand users to use PyDataProvider2 to provide training or testing
 data to PaddlePaddle. The user only needs to focus on how to read a single
@@ -22,18 +24,18 @@ of 28 x 28 pixels.
 
 A small part of the original data as an example is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt
+.. literalinclude:: src/mnist_train.txt
 
 Each line of the data contains two parts, separated by :code:`;`. The first part is
 label of an image. The second part contains 28x28 pixel float values.
 
 Just write path of the above data into train.list. It looks like this:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/train.list
+.. literalinclude:: src/train.list
 
 The corresponding dataprovider is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.py
+.. literalinclude:: src/mnist_provider.dict.py
 
 The first line imports PyDataProvider2 package.
 The main function is the process function, that has two parameters.
@@ -72,7 +74,7 @@ sample by using keywords :code:`yield`.
 Only a few lines of codes need to be added into the training configuration file,
 you can take this as an example.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py
+.. literalinclude:: src/mnist_config.py
 
 Here we specify training data by :code:`train.list`, and no testing data is specified.
 The method which actually provide data is :code:`process`.
@@ -81,7 +83,7 @@ User also can use another style to provide data, which defines the
 :code:`data_layer`'s name explicitly when `yield`. For example,
 the :code:`dataprovider` is shown as below.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
+.. literalinclude:: src/mnist_provider.dict.py
    :linenos:
 
 If user did't give the :code:`data_layer`'s name, PaddlePaddle will use
@@ -102,6 +104,8 @@ And PaddlePadle will do all of the rest things\:
 
 Is this cool?
 
+..  _api_pydataprovider2_sequential_model:
+
 DataProvider for the sequential model
 -------------------------------------
 A sequence model takes sequences as its input. A sequence is made up of several
@@ -117,11 +121,11 @@ negative sentiment (marked by 0 and 1 respectively).
 
 A small part of the original data as an example can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_train.txt
+.. literalinclude:: src/sentimental_train.txt
 
 The corresponding data provider can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_provider.py
+.. literalinclude:: src/sentimental_provider.py
 
 This data provider for sequential model is a little more complex than that
 for MINST dataset.
@@ -139,7 +143,7 @@ initialized. The :code:`on_init` function has the following parameters:
 To pass these parameters into DataProvider, the following lines should be added
 into trainer configuration file.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_config.py
+.. literalinclude:: src/sentimental_config.py
 
 The definition is basically same as MNIST example, except:
 * Load dictionary in this configuration
diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc/api/data_provider/src/mnist_config.py
similarity index 99%
rename from doc_cn/ui/data_provider/mnist_config.py
rename to doc/api/data_provider/src/mnist_config.py
index 39becff03b08f5e75b8503aaf01e782d2b0fb3be..429338c57f8f865f0c5835d933445b65ee2ea7aa 100644
--- a/doc_cn/ui/data_provider/mnist_config.py
+++ b/doc/api/data_provider/src/mnist_config.py
@@ -5,5 +5,6 @@ define_py_data_sources2(
     test_list=None,
     module='mnist_provider',
     obj='process')
+
 img = data_layer(name='pixel', size=784)
 label = data_layer(name='label', size=10)
diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc/api/data_provider/src/mnist_provider.dict.py
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_provider.dict.py
rename to doc/api/data_provider/src/mnist_provider.dict.py
diff --git a/doc_cn/ui/data_provider/mnist_train.txt b/doc/api/data_provider/src/mnist_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_train.txt
rename to doc/api/data_provider/src/mnist_train.txt
diff --git a/doc_cn/ui/data_provider/sentimental_config.py b/doc/api/data_provider/src/sentimental_config.py
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_config.py
rename to doc/api/data_provider/src/sentimental_config.py
diff --git a/doc_cn/ui/data_provider/sentimental_provider.py b/doc/api/data_provider/src/sentimental_provider.py
similarity index 81%
rename from doc_cn/ui/data_provider/sentimental_provider.py
rename to doc/api/data_provider/src/sentimental_provider.py
index 0fb0bb88e95a230f01f18b78ebb37b659c3768f1..14bd0e05a921dbfd5212d8483524d3af3e4ae98f 100644
--- a/doc_cn/ui/data_provider/sentimental_provider.py
+++ b/doc/api/data_provider/src/sentimental_provider.py
@@ -8,19 +8,16 @@ def on_init(settings, dictionary, **kwargs):
 
     # set input types in runtime. It will do the same thing as
     # @provider(input_types) will do, but it is set dynamically during runtime.
-    settings.input_types = [
+    settings.input_types = {
         # The text is a sequence of integer values, and each value is a word id.
         # The whole sequence is the sentences that we want to predict its
         # sentimental.
-        integer_value(
-            len(dictionary), seq_type=SequenceType),  # text input
+        'data': integer_value_sequence(len(dictionary)),  # text input
+        'label': integer_value(2)  # label positive/negative
+    }
 
-        # label positive/negative
-        integer_value(2)
-    ]
-
-    # save dictionary as settings.dictionary. It will be used in process
-    # method.
+    # save dictionary as settings.dictionary. 
+    # It will be used in process method.
     settings.dictionary = dictionary
 
 
diff --git a/doc_cn/ui/data_provider/sentimental_train.txt b/doc/api/data_provider/src/sentimental_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_train.txt
rename to doc/api/data_provider/src/sentimental_train.txt
diff --git a/doc_cn/ui/data_provider/train.list b/doc/api/data_provider/src/train.list
similarity index 100%
rename from doc_cn/ui/data_provider/train.list
rename to doc/api/data_provider/src/train.list
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3718cd73a2003b8ef6c406a9bd51dc68e76402dc
--- /dev/null
+++ b/doc/api/index_cn.rst
@@ -0,0 +1,37 @@
+API中文手册
+============
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_cn.rst
+    data_provider/pydataprovider2_cn.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10c297a71d6988c002de868e804ed9ee2345fbd7
--- /dev/null
+++ b/doc/api/index_en.rst
@@ -0,0 +1,37 @@
+API
+===
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_en.rst
+    data_provider/pydataprovider2_en.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_en.rst
diff --git a/doc/ui/predict/predict_sample.py b/doc/api/predict/src/predict_sample.py
similarity index 99%
rename from doc/ui/predict/predict_sample.py
rename to doc/api/predict/src/predict_sample.py
index 63e8b36d26057d4a87dabb8745de8e13efe2524f..51349250e80ce11e476d952991f0d046f65286b4 100644
--- a/doc/ui/predict/predict_sample.py
+++ b/doc/api/predict/src/predict_sample.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/doc/api/predict/swig_py_paddle_cn.rst b/doc/api/predict/swig_py_paddle_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..15e35353bb25e7906191e47eae49c824b521c7fd
--- /dev/null
+++ b/doc/api/predict/swig_py_paddle_cn.rst
@@ -0,0 +1,56 @@
+基于Python的预测
+================
+
+预测流程
+--------
+
+PaddlePaddle使用swig对常用的预测接口进行了封装，通过编译会生成py_paddle软件包，安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。
+
+基于Python的模型预测，主要包括以下五个步骤。
+
+1. 初始化PaddlePaddle环境
+
+   在程序开始阶段，通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。
+
+2. 解析模型配置文件
+   
+   初始化之后，可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer，所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。
+
+3. 构造paddle.GradientMachine
+  
+   通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。
+
+4. 准备预测数据
+  
+   swig_paddle中的预测接口的参数是自定义的C++数据类型，py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。
+
+5. 模型预测
+  
+   通过调用 ``forwardTest()`` 传入预测数据，直接返回计算结果。
+
+
+预测Demo
+--------
+
+如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。
+
+..  literalinclude:: src/predict_sample.py
+    :language: python
+    :lines: 15-18,121-136
+
+
+Demo预测输出如下，其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据，所以输出的value包含两个向量 。
+
+..  code-block:: text
+
+    [{'id': None, 'value': array(
+      [[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
+          1.43630644e-02,   1.51111044e-13,   9.85625684e-01,
+          2.08823112e-10,   2.32777140e-08,   2.00186201e-09,
+          1.15501715e-08],
+       [  9.99982715e-01,   1.27787406e-10,   1.72296313e-05,
+          1.49316648e-09,   1.36540484e-11,   6.93137714e-10,
+          2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
+          4.48684503e-08]], dtype=float32)}]
+
+
diff --git a/doc/ui/predict/swig_py_paddle_en.rst b/doc/api/predict/swig_py_paddle_en.rst
similarity index 91%
rename from doc/ui/predict/swig_py_paddle_en.rst
rename to doc/api/predict/swig_py_paddle_en.rst
index b743fc456914664168e1be6c7f18a419c38afa62..1c628e6971fa5643e6a9ca629488049957686193 100644
--- a/doc/ui/predict/swig_py_paddle_en.rst
+++ b/doc/api/predict/swig_py_paddle_en.rst
@@ -1,5 +1,5 @@
-Python Prediction API
-=====================
+Python Prediction
+==================
 
 PaddlePaddle offers a set of clean prediction interfaces for python with the help of
 SWIG. The main steps of predict values in python are:
@@ -13,7 +13,7 @@ Here is a sample python script that shows the typical prediction process for the
 MNIST classification problem. A complete sample code could be found at
 :code:`src_root/doc/ui/predict/predict_sample.py`.
 
-..  literalinclude:: ./predict_sample.py
+..  literalinclude:: src/predict_sample.py
     :language: python
     :lines: 15-18,90-100,101-104
 
@@ -23,7 +23,7 @@ python's :code:`help()` function. Let's walk through the above python script:
 
 * At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
   PaddlePaddle with command line arguments, for more about command line arguments
-  see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_.
+  see :ref:`cmd_detail_introduction` .
 * Parse the configuration file that is used in training with :code:`parse_config()`.
   Because data to predict with always have no label, and output of prediction work
   normally is the output layer rather than the cost layer, so you should modify
@@ -36,7 +36,7 @@ python's :code:`help()` function. Let's walk through the above python script:
     - Note: As swig_paddle can only accept C++ matrices, we offer a utility
       class DataProviderConverter that can accept the same input data with
       PyDataProvider2, for more information please refer to document
-      of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_.
+      of :ref:`api_pydataprovider2` .
 * Do the prediction with :code:`forwardTest()`, which takes the converted
   input data and outputs the activations of the output layer.
 
diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/api/trainer_config_helpers/activations.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/activations.rst
rename to doc/api/trainer_config_helpers/activations.rst
diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/trainer_config_helpers/attrs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ac63127bf7d9db6351365ab7b58f43db12347a8e
--- /dev/null
+++ b/doc/api/trainer_config_helpers/attrs.rst
@@ -0,0 +1,5 @@
+Parameter Attributes
+=======================
+
+..  automodule:: paddle.trainer_config_helpers.attrs
+    :members:
diff --git a/doc/ui/api/trainer_config_helpers/data_sources.rst b/doc/api/trainer_config_helpers/data_sources.rst
similarity index 67%
rename from doc/ui/api/trainer_config_helpers/data_sources.rst
rename to doc/api/trainer_config_helpers/data_sources.rst
index 44ea59df43762508e86c7b867fcf136d84c8351e..b9dd4dda01ae59d1260356aff50ddf298d02c87f 100644
--- a/doc/ui/api/trainer_config_helpers/data_sources.rst
+++ b/doc/api/trainer_config_helpers/data_sources.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_data_sources:
+
 DataSources
 ===========
 
diff --git a/doc/ui/api/trainer_config_helpers/evaluators.rst b/doc/api/trainer_config_helpers/evaluators.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/evaluators.rst
rename to doc/api/trainer_config_helpers/evaluators.rst
diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
similarity index 96%
rename from doc/ui/api/trainer_config_helpers/layers.rst
rename to doc/api/trainer_config_helpers/layers.rst
index b487b739a719e9f7118efcc143301da36f7a978e..52a6cfb120504d57617f0d777b5ca49cd7d269d7 100644
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_layers:
+
 ======
 Layers
 ======
@@ -20,6 +22,8 @@ LayerOutput
 Data layer
 ===========
 
+..  _api_trainer_config_helpers_layers_data_layer:
+
 data_layer
 ----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -29,6 +33,8 @@ data_layer
 Fully Connected Layers
 ======================
 
+..  _api_trainer_config_helpers_layers_fc_layer:
+
 fc_layer
 --------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -68,6 +74,8 @@ img_conv_layer
     :members: img_conv_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_context_projection:
+
 context_projection 
 ------------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -185,6 +193,8 @@ mixed_layer
     :members: mixed_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_embedding_layer:
+
 embedding_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -237,6 +247,8 @@ trans_full_matrix_projection
 Aggregate Layers
 ================
 
+..  _api_trainer_config_helpers_layers_pooling_layer:
+
 pooling_layer
 -------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -333,6 +345,8 @@ tensor_layer
     :members: tensor_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_cos_sim:
+
 cos_sim
 -------
 ..  automodule:: paddle.trainer_config_helpers.layers
diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/api/trainer_config_helpers/networks.rst
similarity index 97%
rename from doc/ui/api/trainer_config_helpers/networks.rst
rename to doc/api/trainer_config_helpers/networks.rst
index 29c52c5ce3078f1755162dbbdd65a059d8ba9fa4..e13c368051abe3c50036c3baab988f170df4c641 100644
--- a/doc/ui/api/trainer_config_helpers/networks.rst
+++ b/doc/api/trainer_config_helpers/networks.rst
@@ -13,6 +13,8 @@ sequence_conv_pool
     :members: sequence_conv_pool
     :noindex:
 
+..  _api_trainer_config_helpers_network_text_conv_pool:
+
 text_conv_pool
 --------------
 ..  automodule:: paddle.trainer_config_helpers.networks
diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/api/trainer_config_helpers/optimizers.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/optimizers.rst
rename to doc/api/trainer_config_helpers/optimizers.rst
diff --git a/doc/ui/api/trainer_config_helpers/poolings.rst b/doc/api/trainer_config_helpers/poolings.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/poolings.rst
rename to doc/api/trainer_config_helpers/poolings.rst
diff --git a/doc/build/docker_install.rst b/doc/build/docker_install.rst
deleted file mode 100644
index e95de35f4da35fee511551f13bc6026532cce5c3..0000000000000000000000000000000000000000
--- a/doc/build/docker_install.rst
+++ /dev/null
@@ -1,122 +0,0 @@
-Docker installation guide
-==========================
-
-PaddlePaddle provide the `Docker <https://www.docker.com/>`_ image. `Docker`_ is a lightweight container utilities. The performance of PaddlePaddle in `Docker`_ container is basically as same as run it in a normal linux. The `Docker`_ is a very convenient way to deliver the binary release for linux programs.
-
-..  note::
-
-    The `Docker`_ image is the recommended way to run PaddlePaddle 
-
-PaddlePaddle Docker images
---------------------------
-
-There are 12 `images <https://hub.docker.com/r/paddledev/paddle/tags/>`_ for PaddlePaddle, and the name is :code:`paddle-dev/paddle`,  tags are\: 
-
-
-+-----------------+------------------+------------------------+-----------------------+
-|                 |   normal         |           devel        |          demo         |
-+=================+==================+========================+=======================+
-|       CPU       | cpu-latest       | cpu-devel-latest       | cpu-demo-latest       |
-+-----------------+------------------+------------------------+-----------------------+
-|       GPU       | gpu-latest       | gpu-devel-latest       | gpu-demo-latest       |
-+-----------------+------------------+------------------------+-----------------------+
-| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest |
-+-----------------+------------------+------------------------+-----------------------+
-| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest |
-+-----------------+------------------+------------------------+-----------------------+
-
-And the three columns are:
-
-* normal\: The docker image only contains binary of PaddlePaddle.
-* devel\: The docker image contains PaddlePaddle binary, source code and essential build environment.
-* demo\: The docker image contains the dependencies to run PaddlePaddle demo.
-
-And the four rows are:
-
-* CPU\: CPU Version. Support CPU which has :code:`AVX` instructions.
-* GPU\: GPU Version. Support GPU, and cpu has :code:`AVX` instructions.
-* CPU WITHOUT AVX\: CPU Version, which support most CPU even doesn't have :code:`AVX` instructions.
-* GPU WITHOUT AVX\: GPU Version, which support most CPU even doesn't have :code:`AVX` instructions.
-
-User can choose any version depends on machine. The following script can help you to detect your CPU support :code:`AVX` or not.
-
-..  code-block:: bash
-    
-    if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
-
-If the output is :code:`Support AVX`, then you can choose the AVX version of PaddlePaddle, otherwise, you need select :code:`noavx` version of PaddlePaddle. For example, the CPU develop version of PaddlePaddle is :code:`paddle-dev/paddle:cpu-devel-latest`.
-
-The PaddlePaddle images don't contain any entry command. You need to write your entry command to use this image. See :code:`Remote Access` part or just use following command to run a :code:`bash`
-
-..  code-block:: bash
-
-    docker run -it paddledev/paddle:cpu-latest /bin/bash
-
-
-Download and Run Docker images
-------------------------------
-
-You have to install Docker in your machine which has linux kernel version 3.10+ first. You can refer to the official guide https://docs.docker.com/engine/installation/ for further information.
-
-You can use :code:`docker pull ` to download images first, or just launch a container with :code:`docker run` \:
-
-..  code-block:: bash
-
-    docker run -it paddledev/paddle:cpu-latest
-
-
-If you want to launch container with GPU support, you need to set some environment variables at the same time:
-
-..  code-block:: bash
-
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
-
-
-Some notes for docker
----------------------
-
-Performance
-+++++++++++
-
-Since Docker is based on the lightweight virtual containers, the CPU computing performance maintains well. And GPU driver and equipments are all mapped to the container, so the GPU computing performance would not be seriously affected.
-
-If you use high performance nic, such as RDMA(RoCE 40GbE or IB 56GbE), Ethernet(10GbE), it is recommended to use config "-net = host".
-
-
-
-
-Remote access
-+++++++++++++
-
-
-If you want to enable ssh access background, you need to build an image by yourself. Please refer to official guide https://docs.docker.com/engine/reference/builder/ for further information.
-
-Following is a simple Dockerfile with ssh:
-
-..  literalinclude:: ../../doc_cn/build_and_install/install/paddle_ssh.Dockerfile
-
-Then you can build an image with Dockerfile and launch a container:
-
-..  code-block:: bash
-
-    # cd into Dockerfile directory
-    docker build . -t paddle_ssh
-    # run container, and map host machine port 8022 to container port 22
-    docker run -d -p 8022:22 --name paddle_ssh_machine paddle_ssh
-
-Now, you can ssh on port 8022 to access the container, username is root, password is also root:
-
-..  code-block:: bash
-
-    ssh -p 8022 root@YOUR_HOST_MACHINE
-
-You can stop and delete the container as following:
-
-..  code-block:: bash
-
-    # stop
-    docker stop paddle_ssh_machine
-    # delete
-    docker rm paddle_ssh_machine
diff --git a/doc/cluster/index.rst b/doc/cluster/index.rst
deleted file mode 100644
index 9062f85f98d2981b5c8dcf8149e32c2ccdac77f4..0000000000000000000000000000000000000000
--- a/doc/cluster/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Cluster Train
-====================
-
-.. toctree::
-  :glob:
-
-  opensource/cluster_train.md
-  internal/index.md
diff --git a/doc/demo/image_classification/index.rst b/doc/demo/image_classification/index.rst
deleted file mode 100644
index 1ea68f14164b22cd211d09d72a7358fe24e4fed7..0000000000000000000000000000000000000000
--- a/doc/demo/image_classification/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Image Classification Tutorial
-=============================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <image_classification.md>
-    cluster_train/internal/cluster_train.md
-    cluster_train/opensource/cluster_train.md
diff --git a/doc/demo/index.md b/doc/demo/index.md
deleted file mode 100644
index 289199d496eb3b527fa8c8261820bc8e4d301786..0000000000000000000000000000000000000000
--- a/doc/demo/index.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Examples and demos
-There are serveral examples and demos here.
-
-## Image
-
-* [Image Classification](image_classification/index.rst)
-
-## NLP
-
-* [Sentiment Analysis](sentiment_analysis/index.rst)
-* [Text Generation](text_generation/index.rst)
-* [Semantic Role Labeling](semantic_role_labeling/index.rst)
-
-## Recommendation
-
-* [MovieLens Dataset](rec/ml_dataset.md)
-* [MovieLens Regression](rec/ml_regression.rst)
-
-## Model Zoo
-* [ImageNet: ResNet](imagenet_model/resnet_model.md)
-* [Embedding: Chinese Word](embedding_model/index.md)
diff --git a/doc/demo/semantic_role_labeling/index.rst b/doc/demo/semantic_role_labeling/index.rst
deleted file mode 100644
index ff3035059bd77a8688714db484e420b113c73e53..0000000000000000000000000000000000000000
--- a/doc/demo/semantic_role_labeling/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Semantic Role Labeling Tutorial
-===============================
-
-.. toctree::
-    :maxdepth: 3
-
-    semantic_role_labeling.md
diff --git a/doc/demo/sentiment_analysis/index.rst b/doc/demo/sentiment_analysis/index.rst
deleted file mode 100644
index 9ee6d3a177c19de9fabf7b7e86c7c371bc094736..0000000000000000000000000000000000000000
--- a/doc/demo/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Sentiment Analasis Tutorial
-===========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <sentiment_analysis.md>
-    internal/cluster_train.md
diff --git a/doc/demo/text_generation/index.rst b/doc/demo/text_generation/index.rst
deleted file mode 100644
index 82da5524197ac8d4652f0e30f446b5a88bf1629d..0000000000000000000000000000000000000000
--- a/doc/demo/text_generation/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Text Generation Tutorial
-========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <text_generation.md>
-    internal/cluster_train.md
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
deleted file mode 100644
index 0468dd492b6246cfe0771a05c3597ddee95b3ddd..0000000000000000000000000000000000000000
--- a/doc/dev/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Development Guide
-=================
-
-..  toctree::
-  :maxdepth: 1
-
-  layer.md
-  new_layer/new_layer.rst
-  ../source/index.md
diff --git a/doc/dev/layer.md b/doc/dev/layer.md
deleted file mode 100644
index 930fb0de1ac074b15d06197ed0e732f92288b411..0000000000000000000000000000000000000000
--- a/doc/dev/layer.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Layer Documents
-
-* [Layer Source Code Document](../source/gserver/layers/index.rst)
-* [Layer Python API Document](../ui/api/trainer_config_helpers/index.rst)
diff --git a/doc_cn/faq/index.rst b/doc/faq/index_cn.rst
similarity index 55%
rename from doc_cn/faq/index.rst
rename to doc/faq/index_cn.rst
index 3eb0e10ae2228740cd384270db5070e367f7007b..f2f114065c4109cd0b36752da622ea01ce822ceb 100644
--- a/doc_cn/faq/index.rst
+++ b/doc/faq/index_cn.rst
@@ -1,25 +1,21 @@
 ####################
-PaddlePaddle常见问题
+FAQ
 ####################
 
 ..  contents::
 
-1. 如何减少PaddlePaddle的内存占用
+1. 如何减少内存占用
 ---------------------------------
 
-神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
+神经网络的训练本身是一个非常消耗内存和显存的工作，经常会消耗数10GB的内存和数GB的显存。
 PaddlePaddle的内存占用主要分为如下几个方面\:
 
-* DataProvider缓冲池内存 (只针对内存)
-* 神经元激活内存 （针对内存和显存）
-* 参数内存 (针对内存和显存)
+* DataProvider缓冲池内存（只针对内存）
+* 神经元激活内存（针对内存和显存）
+* 参数内存 （针对内存和显存）
 * 其他内存杂项
 
-这其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，
-这些内存就不考虑如何缩减了。
-
-其他的内存的减少方法依次为
-
+其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，暂不考虑在内。
 
 减少DataProvider缓冲池内存
 ++++++++++++++++++++++++++
@@ -37,30 +33,29 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数
 个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
 那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
-这样做可以极大的减少内存占用，并且可能会加速训练过程。 详细文档参考 `这里
-<../ui/data_provider/pydataprovider2.html#provider>`_ 。
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 神经元激活内存
 ++++++++++++++
 
-神经网络在训练的时候，会对每一个激活暂存一些数据，包括激活，參差等等。
+神经网络在训练的时候，会对每一个激活暂存一些数据，如神经元激活值等。
 在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
 一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
 的时间步信息成正比。
 
-所以，做法可以有两种。他们是
+所以做法可以有两种：
 
 * 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
 * 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
-  但是突然有一个10000长的序列，就很容易导致内存超限。特别是在LSTM等RNN中。
+  但是突然有一个10000长的序列，就很容易导致内存超限，特别是在LSTM等RNN中。
 
 参数内存
 ++++++++
 
 PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
-例如如果使用 :code:`adadelta` 算法，则需要使用参数规模大约5倍的内存。 如果参数保存下来的
+例如使用 :code:`adadelta` 算法，则需要使用等于权重参数规模大约5倍的内存。举例，如果参数保存下来的模型目录
 文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
 
 可以考虑使用一些优化算法，例如 :code:`momentum`。
@@ -68,11 +63,11 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 2. 如何加速PaddlePaddle的训练速度
 ---------------------------------
 
-PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几个方面\：
+加速PaddlePaddle训练可以考虑从以下几个方面\：
 
 * 减少数据载入的耗时
 * 加速训练速度
-* 利用更多的计算资源
+* 利用分布式训练驾驭更多的计算资源
 
 减少数据载入的耗时
 ++++++++++++++++++
@@ -80,7 +75,7 @@ PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几
 使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
 同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
 
@@ -94,11 +89,11 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 
 使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
 
-..  literalinclude:: word2vec_dataprovider.py
+..  literalinclude:: src/word2vec_dataprovider.py
 
 这个任务的配置为\:
 
-..  literalinclude:: word2vec_config.py
+..  literalinclude:: src/word2vec_config.py
 
 更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
 
@@ -108,25 +103,20 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源可以分为一下几个方式来进行\:
 
 * 单机CPU训练
-  * 使用多线程训练。设置命令行参数 :code:`trainer_count`，即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
+
 * 单机GPU训练
-  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
-  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练，使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
+
 * 多机训练
-  * 使用多机训练的方法也比较简单，需要先在每个节点启动 :code:`paddle pserver`，在使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
-  * 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
+  * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 
-3. 遇到“非法指令”或者是“illegal instruction” 
+3. 遇到“非法指令”或者是“illegal instruction”
 --------------------------------------------
 
-paddle在进行计算的时候为了提升计算性能，使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。（另：用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉，请当成是不支持，看下面的解决方案）
-
-解决办法是\:
-
-* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
-* 或者，使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
-
+PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二进制发行版可能会导致这种错误，请选择正确的版本。
 
 4. 如何选择SGD算法的学习率
 --------------------------
@@ -150,7 +140,7 @@ paddle在进行计算的时候为了提升计算性能，使用了avx指令。
 
 ..  code-block:: python
 
-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                       bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
 
 上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
@@ -158,7 +148,7 @@ paddle在进行计算的时候为了提升计算性能，使用了avx指令。
 6. 如何共享参数
 ---------------
 
-PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
 
 简单的全连接网络，参数共享的配置示例为\:
 
@@ -166,8 +156,8 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
 
-7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
------------------------------------------------------------------------
+7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
 
 出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
 而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
@@ -200,17 +190,116 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
     41 - test_config_parser (Failed)
     42 - test_swig_api (Failed)
     43 - layers_test (Failed)
-    
+
 并且查询PaddlePaddle单元测试的日志，提示：
 
 ..  code-block:: bash
-    
+
     paddle package is already in your PYTHONPATH. But unittest need a clean environment.
     Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-    
-解决办法是：卸载paddle包 :code:`pip uninstall paddle`。
 
-原因是：单元测试使用了一个旧版本的python包，而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境：
+解决办法是：
+
+* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
+
+
+9. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
+----------------------------------------------------------------
+
+用户在使用PaddlePaddle GPU的Docker镜像的时候，常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
+具体的解决方法是：
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+
+更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
+
+
+10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
+----------------------------------------------------------------
+
+这是目前CMake寻找Python的逻辑存在缺陷，如果系统安装了多个Python版本，CMake找到的Python库和Python解释器版本可能有不一致现象，导致编译PaddlePaddle失败。正确的解决方法是，
+用户强制指定特定的Python版本，具体操作如下：
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
+
+10. A protocol message was rejected because it was too big
+----------------------------------------------------------
+
+如果在训练NLP相关模型时，出现以下错误：
+
+..  code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：
+
+..  code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+解决方案是：将字典的地址作为args传给dataprovider，然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为：
+
+..  code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
+
+11. 如何指定GPU设备
+-------------------
+
+例如机器上有4块GPU，编号从0开始，指定使用2、3号GPU：
+
+* 方式1：通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
+
+..      code-block:: bash
+
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* 方式2：通过命令行参数 ``--gpu_id`` 指定。
+
+..      code-block:: bash
+
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
+---------------------------------------------------
+
+目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
+:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
+
+..  code-block:: bash
+
+    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
+
+..  code-block:: bash
+
+    git submodule init
+    git submodule update
 
-* 如果paddle包已经在python的site-packages里面了，那么单元测试时使用的paddle包，就是site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。
-* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
\ No newline at end of file
+来获得所有第三方模块。
\ No newline at end of file
diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc/faq/src/reduce_min_pool_size.py
similarity index 100%
rename from doc_cn/faq/reduce_min_pool_size.py
rename to doc/faq/src/reduce_min_pool_size.py
diff --git a/doc_cn/faq/word2vec_config.py b/doc/faq/src/word2vec_config.py
similarity index 100%
rename from doc_cn/faq/word2vec_config.py
rename to doc/faq/src/word2vec_config.py
diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc/faq/src/word2vec_dataprovider.py
similarity index 100%
rename from doc_cn/faq/word2vec_dataprovider.py
rename to doc/faq/src/word2vec_dataprovider.py
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d01cdaaeb75ec7d02480eb9162cabaad2a947db9
--- /dev/null
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -0,0 +1,108 @@
+经典的线性回归任务
+==================
+
+PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
+
+任务简介
+--------
+
+我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
+
+一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
+
+准备数据
+-----------
+
+假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
+
+.. code-block:: python
+
+    # dataprovider.py
+    from paddle.trainer.PyDataProvider2 import *
+    import random
+
+    # 定义输入数据的类型: 2个浮点数
+    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
+    def process(settings, input_file):
+        for i in xrange(2000):
+            x = random.random()
+            yield [x], [2*x+0.3]
+
+训练模型
+-----------
+
+为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
+
+在PaddlePaddle里，该模型的网络配置如下。
+
+.. code-block:: python
+
+    # trainer_config.py
+    from paddle.trainer_config_helpers import *
+
+    # 1. 定义数据来源，调用上面的process函数获得观测数据
+    data_file = 'empty.list'
+    with open(data_file, 'w') as f: f.writelines(' ')
+    define_py_data_sources2(train_list=data_file, test_list=None, 
+                            module='dataprovider', obj='process',args={})
+
+    # 2. 学习算法。控制如何改变模型参数 w 和 b
+    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+    # 3. 神经网络配置
+    x = data_layer(name='x', size=1)
+    y = data_layer(name='y', size=1)
+    # 线性计算网络层: ȳ = wx + b
+    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+    # 计算误差函数，即  ȳ 和真实 y 之间的距离
+    cost = regression_cost(input= ȳ, label=y)
+    outputs(cost)
+
+
+这段简短的配置展示了PaddlePaddle的基本用法：
+
+- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
+
+- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
+
+- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
+    
+    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
+    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
+    - **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+
+定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
+
+.. code-block:: bash
+
+    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+
+PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
+
+模型检验
+-----------
+
+训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
+
+PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
+
+.. code-block:: python
+
+    import numpy as np
+    import os
+
+    def load(file_name):
+        with open(file_name, 'rb') as f:
+            f.read(16) # skip header for float type.
+            return np.fromfile(f, dtype=np.float32)
+        
+    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+    # w=1.999743, b=0.300137
+
+.. image:: ./parameters.png
+     :align: center
+     :scale: 80 %
+
+从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
+
+这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c10b897d4292d0c2b062b5c8e23466505afa408a
--- /dev/null
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -0,0 +1,101 @@
+Simple Linear Regression
+========================
+
+PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
+
+Problem Background
+------------------
+
+Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
+
+Prepare the Data
+-----------------
+
+Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
+
+    .. code-block:: python
+
+        # dataprovider.py
+        from paddle.trainer.PyDataProvider2 import *
+        import random
+
+        # define data types of input: 2 real numbers
+        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
+        def process(settings, input_file):
+            for i in xrange(2000):
+                x = random.random()
+                yield [x], [2*x+0.3]
+
+Train a NeuralNetwork
+----------------------
+
+To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
+
+    .. code-block:: python
+
+        # trainer_config.py
+        from paddle.trainer_config_helpers import *
+
+        # 1. read data. Suppose you saved above python code as dataprovider.py
+        data_file = 'empty.list'
+        with open(data_file, 'w') as f: f.writelines(' ')
+        define_py_data_sources2(train_list=data_file, test_list=None, 
+                module='dataprovider', obj='process',args={})
+
+        # 2. learning algorithm
+        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+        # 3. Network configuration
+        x = data_layer(name='x', size=1)
+        y = data_layer(name='y', size=1)
+        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+        cost = regression_cost(input=y_predict, label=y)
+        outputs(cost)
+
+Some of the most fundamental usages of PaddlePaddle are demonstrated:
+
+-  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
+
+-  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
+
+-  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
+	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
+	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
+	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
+
+Now that everything is ready, you can train the network with a simple command line call:
+
+    .. code-block:: bash
+ 
+        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ 
+
+This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
+
+
+Evaluate the Model
+-------------------
+
+Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
+
+In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
+
+    .. code-block:: python
+
+        import numpy as np
+        import os
+
+        def load(file_name):
+            with open(file_name, 'rb') as f:
+                f.read(16) # skip header for float type.
+                return np.fromfile(f, dtype=np.float32)
+                
+        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+        # w=1.999743, b=0.300137
+
+    .. image:: parameters.png
+        :align: center
+
+Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
+
+There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
diff --git a/doc_cn/introduction/parameters.png b/doc/getstarted/basic_usage/parameters.png
similarity index 100%
rename from doc_cn/introduction/parameters.png
rename to doc/getstarted/basic_usage/parameters.png
diff --git a/doc/build/build_from_source.md b/doc/getstarted/build_and_install/build_from_source_en.md
similarity index 75%
rename from doc/build/build_from_source.md
rename to doc/getstarted/build_and_install/build_from_source_en.md
index b8f26f431eb7a04147fe791a8c805427c827fe09..aaa07d49d3148266db27670a98c2b27db4dc0a8f 100644
--- a/doc/build/build_from_source.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -6,22 +6,33 @@ Installing from Sources
 * [3. Build on Ubuntu](#ubuntu)
 
 ## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
+You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
 
 ```bash
-git clone https://github.com/baidu/Paddle paddle
+git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
+git submodule update --init --recursive
+```
+
+If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
+
+If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
+```
+git submodule update --remote
 ```
 
 ## <span id="requirements">Requirements</span>
 
-To compile the source code, your computer must be equipped with GCC >=4.6 or Clang compiler.
-### Dependencies
+To compile the source code, your computer must be equipped with the following dependencies.
 
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
 - **CMake**: version >= 2.8
 - **BLAS**: MKL, OpenBlas or ATLAS
-- **protobuf**: version >= 2.4, **Note: 3.x is not supported**
-- **python**: only python 2.7 is supported currently
+- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
+- **Python**: only python 2.7 is supported currently
+
+**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
+For CUDA 8.0, GCC versions later than 5.3 are not supported!
 
 ### Options
 
@@ -38,10 +49,8 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 <tbody>
 <tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
 <tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_GLOG</td><td class="left">Compile with glog. If not found, default: an internal log implementation.</td></tr>
-<tr><td class="left">WITH_GFLAGS</td><td class="left">Compile with gflags. If not found, default: an internal flag implementation.</td></tr>
 <tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">	Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
 </tbody>
@@ -49,8 +58,8 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </html>
 
 **Note:**
-  - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5.
-  - Other versions like Cuda Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4 are also supported.
+  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
+  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
   - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
 
 As a simple example, consider the following:  
@@ -75,7 +84,7 @@ As a simple example, consider the following:
 
     ```bash
     pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme breathe recommonmark
+    pip install sphinx_rtd_theme recommonmark
 
     # install doxygen on Ubuntu
     sudo apt-get install doxygen 
@@ -95,7 +104,7 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
     # optional
     sudo apt-get install libgoogle-glog-dev
     sudo apt-get install libgflags-dev
@@ -149,15 +158,15 @@ If still not found, you can manually set it based on CMake error information fro
 
 As a simple example, consider the following:
 
-- **Only CPU**
+- **Only CPU with swig**
 
   ```bash
-  cmake  .. -DWITH_GPU=OFF
+  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
   ```
-- **GPU**
+- **GPU with swig**
 
   ```bash
-  cmake .. -DWITH_GPU=ON
+  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
   ```
 
 - **GPU with doc and swig**
@@ -170,15 +179,13 @@ Finally, you can build PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
 ```
 
-**Note:**
-
 If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
 Otherwise, PaddlePaddle will automatically install python dependencies
 at first time when user run paddle commands, such as `paddle version`, `paddle train`.
diff --git a/doc/build/cmake.png b/doc/getstarted/build_and_install/cmake.png
similarity index 100%
rename from doc/build/cmake.png
rename to doc/getstarted/build_and_install/cmake.png
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
similarity index 94%
rename from doc_cn/build_and_install/cmake/compile_options.rst
rename to doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index f345ead2bf851bdad7be2fb8185d16fd2a318a66..3a52c8723bbccd70dd89e8913092d92813925f90 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -1,43 +1,43 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
+PaddlePaddle的编译选项
+======================
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+Bool型的编译选项
+----------------
+用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool型的编译选项
+    :widths: 1, 7, 2
+    :file: compile_options.csv
+
+BLAS/CUDA/Cudnn的编译选项
+--------------------------
+BLAS
++++++
+
+PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
+
+..  csv-table:: BLAS路径相关的编译选项
+    :widths: 1, 2, 7
+    :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
+
+编译选项的设置
+++++++++++++++
+
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+
+..  code-block:: bash
+
+    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
 注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
similarity index 100%
rename from doc_cn/build_and_install/cmake/cblas_settings.csv
rename to doc/getstarted/build_and_install/cmake/cblas_settings.csv
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
similarity index 65%
rename from doc_cn/build_and_install/cmake/compile_options.csv
rename to doc/getstarted/build_and_install/cmake/compile_options.csv
index 12b45eebb2822d77447fa1bc754360605971dcab..463b825470579d0c3736a408b1e82dd33e6f8d42 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc/getstarted/build_and_install/cmake/compile_options.csv
@@ -1,14 +1,12 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_GLOG,是否开启GLOG。如果不开启，则会使用一个简化版的日志，同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS。如果不开启，则会使用一个简化版的命令行参数解析器，同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
+选项,说明,默认值
+WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
+WITH_DOUBLE,是否使用双精度浮点数。,否
+WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
+WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
+WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
+WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
+WITH_RDMA,是否开启RDMA,否
+WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
+WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
+WITH_DOC,是否编译中英文文档,否
 WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
similarity index 89%
rename from doc_cn/build_and_install/install/docker_install.rst
rename to doc/getstarted/build_and_install/docker_install_cn.rst
index a5f5fb117e11e8ac1ae49e4271e826fa12d5e810..35234e0eb3ece3cb20d62841c1d75e60b485b9ea 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,9 +1,7 @@
 安装PaddlePaddle的Docker镜像
 ============================
 
-PaddlePaddle提供了Docker的使用镜像。PaddlePaddle推荐使用Docker进行PaddlePaddle的部署和
-运行。Docker是一个基于容器的轻量级虚拟环境。具有和宿主机相近的运行效率，并提供
-了非常方便的二进制分发手段。
+PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Docker镜像是我们目前唯一官方支持的部署和运行方式。
 
 下述内容将分为如下几个类别描述。
 
@@ -41,7 +39,7 @@ PaddlePaddle提供的Docker镜像版本
 * CPU WITHOUT AVX: CPU版本，不支持AVX指令集的CPU也可以运行
 * GPU WITHOUT AVX: GPU版本，不需要AVX指令集的CPU也可以运行。
 
-用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU知否支持 :code:`AVX` 指令集\:
+用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU是否支持 :code:`AVX` 指令集\:
 
 ..  code-block:: bash
 
@@ -67,7 +65,7 @@ mac osx或者是windows机器，请参考
 
 ..  code-block:: bash
     
-    $ docker run -it paddledev/paddlepaddle:cpu-latest
+    $ docker run -it paddledev/paddle:cpu-latest
 
 即可启动和进入PaddlePaddle的container。如果运行GPU版本的PaddlePaddle，则需要先将
 cuda相关的Driver和设备映射进container中，脚本类似于
@@ -76,7 +74,7 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
     $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
 
 进入Docker container后，运行 :code:`paddle version` 即可打印出PaddlePaddle的版本和构建
 信息。安装完成的PaddlePaddle主体包括三个部分， :code:`paddle` 脚本， python的
@@ -113,7 +111,24 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
 简单的含有ssh的Dockerfile如下：
 
-..  literalinclude:: paddle_ssh.Dockerfile
+..  code-block:: bash
+
+    FROM paddledev/paddle:cpu-latest
+
+    MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
+
+    RUN apt-get update
+    RUN apt-get install -y openssh-server
+    RUN mkdir /var/run/sshd
+    RUN echo 'root:root' | chpasswd
+
+    RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+    RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+    EXPOSE 22
+
+    CMD    ["/usr/sbin/sshd", "-D"]
+
 
 使用该Dockerfile构建出镜像，然后运行这个container即可。相关命令为\:
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7633bf4d576ee6a3e75c2c493eb248f5d2636628
--- /dev/null
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -0,0 +1,191 @@
+PaddlePaddle in Docker Containers
+=================================
+
+Docker container is currently the only officially-supported way to
+running PaddlePaddle.  This is reasonable as Docker now runs on all
+major operating systems including Linux, Mac OS X, and Windows.
+Please be aware that you will need to change `Dockers settings
+<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
+of your hardware resource on Mac OS X and Windows.
+
+
+Development Using Docker
+------------------------
+
+Developers can work on PaddlePaddle using Docker.  This allows
+developers to work on different platforms -- Linux, Mac OS X, and
+Windows -- in a consistent way.
+
+The general development workflow with Docker and Bazel is as follows:
+
+1. Get the source code of Paddle:
+
+   .. code-block:: bash
+
+      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+   
+   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
+
+   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
+   empty, please use the following command to get the submodule.
+
+   .. code-block:: bash
+
+      git submodule update --init --recursive
+
+
+2. Build a development Docker image :code:`paddle:dev` from the source
+   code.  This image contains all the development tools and
+   dependencies of PaddlePaddle.
+
+
+   .. code-block:: bash
+
+      cd paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+
+
+3. Run the image as a container and mounting local source code
+   directory into the container.  This allows us to change the code on
+   the host and build it within the container.
+
+   .. code-block:: bash
+
+      docker run       \
+       -d              \
+       --name paddle   \
+       -p 2022:22      \
+       -v $PWD:/paddle \
+       -v $HOME/.cache/bazel:/root/.cache/bazel \
+       paddle:dev
+
+   where :code:`-d` makes the container running in background,
+   :code:`--name paddle` allows us to run a nginx container to serve
+   documents in this container, :code:`-p 2022:22` allows us to SSH
+   into this container, :code:`-v $PWD:/paddle` shares the source code
+   on the host with the container, :code:`-v
+   $HOME/.cache/bazel:/root/.cache/bazel` shares Bazel cache on the
+   host with the container.
+
+4. SSH into the container:
+
+   .. code-block:: bash
+
+      ssh root@localhost -p 2022
+
+5. We can edit the source code in the container or on this host.  Then
+   we can build using cmake
+
+   .. code-block:: bash
+
+      cd /paddle # where paddle source code has been mounted into the container
+      mkdir -p build
+      cd build
+      cmake -DWITH_TESTING=ON ..
+      make -j `nproc`
+      CTEST_OUTPUT_ON_FAILURE=1 ctest
+
+   or Bazel in the container:
+
+   .. code-block:: bash
+
+      cd /paddle
+      bazel test ...
+
+
+CPU-only and GPU Images
+-----------------------
+
+For each version of PaddlePaddle, we release 2 Docker images, a
+CPU-only one and a CUDA GPU one.  We do so by configuring
+`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
+automatically runs the following commands:
+
+.. code-block:: bash
+
+   docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
+   docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
+
+
+To run the CPU-only image as an interactive container:
+
+.. code-block:: bash
+
+    docker run -it --rm paddledev/paddle:cpu-latest /bin/bash
+
+or, we can run it as a daemon container
+
+.. code-block:: bash
+
+    docker run -d -p 2202:22 paddledev/paddle:cpu-latest
+
+and SSH to this container using password :code:`root`:
+
+.. code-block:: bash
+
+    ssh -p 2202 root@localhost
+
+An advantage of using SSH is that we can connect to PaddlePaddle from
+more than one terminals.  For example, one terminal running vi and
+another one running Python interpreter.  Another advantage is that we
+can run the PaddlePaddle container on a remote server and SSH to it
+from a laptop.
+
+
+Above methods work with the GPU image too -- just please don't forget
+to install CUDA driver and let Docker knows about it:
+
+.. code-block:: bash
+
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
+
+
+Non-AVX Images
+--------------
+
+Please be aware that the CPU-only and the GPU images both use the AVX
+instruction set, but old computers produced before 2008 do not support
+AVX.  The following command checks if your Linux computer supports
+AVX:
+
+.. code-block:: bash
+
+   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+
+If it doesn't, we will need to build non-AVX images manually from
+source code:
+
+.. code-block:: bash
+
+   cd ~
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   git submodule update --init --recursive
+   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
+   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+
+
+Documentation
+-------------
+
+Paddle Docker images include an HTML version of C++ source code
+generated using `woboq code browser
+<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
+for users to browse and understand the C++ source code.
+
+As long as we give the Paddle Docker container a name, we can run an
+additional Nginx Docker container to serve the volume from the Paddle
+container:
+
+.. code-block:: bash
+
+   docker run -d --name paddle-cpu-doc paddle:cpu
+   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+
+
+Then we can direct our Web browser to the HTML version of source code
+at http://localhost:8088/paddle/
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ffa8585041d3023161c2cada8a3dc149f740ba0
--- /dev/null
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -0,0 +1,27 @@
+编译与安装
+==========
+
+安装
+++++
+
+PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+
+.. toctree::
+   :maxdepth: 1
+   
+   docker_install_cn.rst 
+   ubuntu_install_cn.rst
+
+
+
+编译
+++++
+
+..  warning::
+
+    编译选项主要推荐高级用户查看，普通用户请走安装流程。
+
+..  toctree::
+    :maxdepth: 1
+
+    cmake/build_from_source_cn.rst
diff --git a/doc/build/index.rst b/doc/getstarted/build_and_install/index_en.rst
similarity index 67%
rename from doc/build/index.rst
rename to doc/getstarted/build_and_install/index_en.rst
index b4fe4596047c7d201fdf36bc76c26d5134611560..1bfd4f75c0b9b82d61d28a30f03181f7be159f24 100644
--- a/doc/build/index.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -6,12 +6,9 @@ Install PaddlePaddle
 
 ..  toctree::
     :maxdepth: 1
-    :glob:
 
-    install_*
-    internal/install_from_jumbo.md
-    docker_install.rst
-    ubuntu_install.rst
+    docker_install_en.rst
+    ubuntu_install_en.rst
 
 Build from Source
 -----------------
@@ -22,7 +19,5 @@ Build from Source
 
 ..  toctree::
     :maxdepth: 1
-    :glob:
 
-    build_from_source.md
-    contribute_to_paddle.md
+    build_from_source_en.md
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d02d9c63bbfb50954d7b75f2c685ce167a3b7146
--- /dev/null
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -0,0 +1,72 @@
+Ubuntu部署PaddlePaddle
+===================================
+
+PaddlePaddle提供了ubuntu 14.04 deb安装包。
+
+安装
+------
+
+安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
+
+它包含四个版本\:
+
+* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
+
+* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
+
+* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
+
+* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
+
+下载完相关安装包后，执行:
+
+..  code-block:: shell
+
+    sudo apt-get install gdebi
+    gdebi paddle-*-cpu.deb
+
+或者:
+
+..  code-block:: shell
+
+    dpkg -i paddle-*-cpu.deb
+    apt-get install -f
+
+
+在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
+在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
+
+安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
+
+..  code-block:: shell
+
+    PaddlePaddle 0.8.0b1, compiled with
+        with_avx: ON
+        with_gpu: OFF
+        with_double: OFF
+        with_python: ON
+        with_rdma: OFF
+        with_metric_learning:
+        with_timer: OFF
+        with_predict_sdk:
+
+
+可能遇到的问题
+--------------
+
+libcudart.so/libcudnn.so找不到
+++++++++++++++++++++++++++++++
+
+安装完成后，运行 :code:`paddle train` 报错\:
+
+..  code-block:: shell
+
+      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
+
+原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
+
+..  code-block:: shell
+
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+
diff --git a/doc/build/ubuntu_install.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst
similarity index 100%
rename from doc/build/ubuntu_install.rst
rename to doc/getstarted/build_and_install/ubuntu_install_en.rst
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4d3121c5857cd434acecb389d68f4d4c7a532
--- /dev/null
+++ b/doc/getstarted/index_cn.rst
@@ -0,0 +1,8 @@
+新手入门
+============
+
+..  toctree::
+  :maxdepth: 2
+
+  build_and_install/index_cn.rst
+  basic_usage/index_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..55d95d8015e56ddae3363d19315db0fad841caad
--- /dev/null
+++ b/doc/getstarted/index_en.rst
@@ -0,0 +1,8 @@
+GET STARTED
+============
+
+..  toctree::
+  :maxdepth: 2
+
+  build_and_install/index_en.rst
+  basic_usage/index_en.rst
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
similarity index 50%
rename from doc_cn/algorithm/rnn/hierarchical-layer.md
rename to doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 519653df081d6e7919ada3cbff6aaf4d2a2f6115..a9906b8b9c2036ae349f30d7edee770884f73f99 100644
--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -1,6 +1,11 @@
-# 支持双层序列作为输入的Layer
+###########################
+支持双层序列作为输入的Layer
+###########################
 
-## 概述
+..	contents::
+
+概述
+====
 
 在自然语言处理任务中，序列是一种常见的数据类型。一个独立的词语，可以看作是一个非序列输入，或者，我们称之为一个0层的序列；由词语构成的句子，是一个单层序列；若干个句子构成一个段落，是一个双层的序列。
 
@@ -12,55 +17,79 @@
 + 单层序列：排成一列的多个元素，每个元素是一个0层序列，元素之间的顺序是重要的输入信息
 + 双层序列：排成一列的多个元素，每个元素是一个单层序列，称之为双层序列的一个子序列（subseq），subseq的每个元素是一个0层序列
 
-
 在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
-## pooling_layer
-
-pooling_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>。
-```python
-seq_pool = pooling_layer(input=layer,
-                         pooling_type=AvgPooling(),
-                         agg_level=AggregateLevel.EACH_SEQUENCE)
-```
+
+pooling_layer
+==============
+
+pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
+
+..	code-block:: bash
+
+        seq_pool = pooling_layer(input=layer,
+                                 pooling_type=AvgPooling(),
+                                 agg_level=AggregateLevel.EACH_SEQUENCE)
+        
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
-- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+
+- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
   - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
-- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
+
   - 作用：一个双层序列经过运算变成一个单层序列
   - 输入：必须是一个双层序列
   - 输出：一个单层序列，序列的每个元素是原来双层序列每个subseq元素的平均值（或最大值）
 
-## last_seq 和 first_seq
+last_seq 和 first_seq
+=====================
+
+last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_ 配置API。
+
+..	code-block:: bash
+
+        last = last_seq(input=layer,
+                        agg_level=AggregateLevel.EACH_SEQUENCE)
+        
+- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
 
-last_seq的使用示例如下（first_seq类似），详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>。
-```python
-last = last_seq(input=layer,
-                agg_level=AggregateLevel.EACH_SEQUENCE)
-```
-- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
   - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
-- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
   - 作用：一个双层序列经过运算变成一个单层序列
   - 输入：必须是一个双层序列
   - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
 
-## expand_layer
+expand_layer
+============
+
+expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
+
+..	code-block:: bash
+
+        expand = expand_layer(input=layer1,
+                              expand_as=layer2,
+                              expand_level=ExpandLevel.FROM_TIMESTEP)
+        
+- `expand_level=ExpandLevel.FROM_TIMESTEP` 时（默认值）：
 
-expand_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>。
-```python
-expand = expand_layer(input=layer1,
-                      expand_as=layer2,
-                      expand_level=ExpandLevel.FROM_TIMESTEP)
-```
-- `expand_level=ExpandLevel.FROM_TIMESTEP`时（默认值）：
   - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
-  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
-  - 输出：一个单层序列，或一个双层序列，输出序列的类型（双层序列，或单层序列）和序列中含有元素的数目同 layer2一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
-- `expand_level=ExpandLevel.FROM_SEQUENCE`时：
+  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2 可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
+  - 输出：一个单层序列或一个双层序列，输出序列的类型（双层序列或单层序列）和序列中含有元素的数目同 layer2 一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
+
+- `expand_level=ExpandLevel.FROM_SEQUENCE` 时：
+
   - 作用：一个单层序列经过运算扩展成一个双层序列
-  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2必须是一个双层序列，提供扩展的长度信息
-  - 输出：一个双层序列，序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目（0层序列），和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个subseq。
+  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2 必须是一个双层序列，提供扩展的长度信息
+  - 输出：一个双层序列，序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目（0层序列）和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个 subseq 。
+
+
+.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
+.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
+.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
+.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..96e52b910a22576fd75c9d4e1bef6e2cf74bc84f
--- /dev/null
+++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
@@ -0,0 +1,231 @@
+..  _algo_hrnn_rnn_api_compare:
+
+#####################
+单双层RNN API对比介绍
+#####################
+
+本文以PaddlePaddle的双层RNN单元测试为示例，用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型，来讲解如何使用双层RNN。本文中所有的例子，都只是介绍双层RNN的API接口，并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用，请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/reyoung/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+
+示例1：双层RNN，子序列间无Memory
+================================
+
+在双层RNN中的经典情况是将内层的每一个时间序列数据，分别进行序列操作；并且内层的序列操作之间独立无依赖，即不需要使用Memory\ 。
+
+在本示例中，单层RNN和双层RNN的网络配置，都是将每一句分好词后的句子，使用LSTM作为encoder，压缩成一个向量。区别是RNN使用两层序列模型，将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下：
+
+* 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_
+* 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+读取双层序列数据
+----------------
+
+首先，本示例中使用的原始数据如下\:
+
+- 本例中的原始数据一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- 双层序列数据一共有4个样本。 每个样本间用空行分开，整体数据和原始数据完全一样。但于双层序列的LSTM来说，第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- 这是普通的单层时间序列的DataProvider代码，其说明如下：
+  
+  * DataProvider共返回两个数据，分别是words和label。即上述代码中的第19行。
+
+    - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
+    - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- 对于同样的数据，双层时间序列的DataProvider的代码。其说明如下：
+
+  - DataProvider共返回两组数据，分别是sentences和labels。即在双层序列的原始数据中，每一组内的所有句子和labels
+  - sentences是双层时间序列的数据。由于它内部包含了每组数据中的所有句子，且每个句子表示为对应的词表索引数组，因此它是integer_value_sub_sequence 类型的，即双层时间序列。
+  - labels是每组内每个句子的标签，故而是一个单层时间序列。
+
+
+模型配置的模型配置
+------------------------------------------
+
+首先，我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中，RNN对于每一个时间步通过了一个LSTM网络。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+其次，我们看一下语义相同的双层RNN的网络配置\:
+
+* PaddlePaddle中的许多layer并不在意输入是否是时间序列，例如\ :code:`embedding_layer`\ 。在这些layer中，所有的操作都是针对每一个时间步来进行的。
+
+* 在该配置的7-26行(高亮部分)，将双层时间序列数据先变换成单层时间序列数据，再对每一个单层时间序列进行处理。
+
+  * 使用\ :code:`recurrent_group`\ 这个函数进行变换，在变换时需要将输入序列传入。由于我们想要的变换是双层时间序列=> 单层时间序列，所以我们需要将输入数据标记成\ :code:`SubsequenceInput`\ 。
+  
+  * 在本例中，我们将原始数据的每一组，通过\ :code:`recurrent_group`\ 进行拆解，拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。
+
+* 与单层RNN的配置类似，我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同，我们是对每一个子序列取最后一个元素，因此\ :code:`agg_level=AggregateLevel.EACH_SEQUENCE`\ 。
+
+* 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+示例2：双层RNN，子序列间有Memory
+================================
+
+本示例意图使用单层RNN和双层RNN实现两个完全等价的全连接RNN。
+
+* 对于单层RNN，输入数据为一个完整的时间序列，例如\ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ 。
+
+* 对于双层RNN，输入数据为在单层RNN数据里面，任意将一些数据组合成双层时间序列，例如\ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`。
+
+模型配置的模型配置
+------------------
+
+我们选取单双层序列配置中的不同部分，来对比分析两者语义相同的原因。
+
+- 单层RNN：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- 双层RNN，外层memory是一个元素：
+
+  - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
+  - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    PaddlePaddle目前只支持在每个时间步中，Memory的时间序列长度一致的情况。
+
+示例3：双层RNN，输入不等长
+==========================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
+
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf>`_\ 。
+
+示例3对于单层RNN和双层RNN数据完全相同。
+
+* 对于单层RNN的数据一共有两个样本，他们分别是\ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ 和\ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ 。对于每一个单层RNN的数据，均有两组特征。
+
+* 在单层数据的基础上，双层RNN数据随意加了一些隔断，例如将第一条数据转化为\ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ 。
+
+* 需要注意的是PaddlePaddle目前只支持子序列数目一样的多输入双层RNN。例如本例中的两个特征，均有三个子序列。每个子序列长度可以不一致，但是子序列的数目必须一样。
+
+
+模型配置
+--------
+
+和示例2中的配置类似，示例3的配置使用了单层RNN和双层RNN，实现两个完全等价的全连接RNN。
+
+* 单层RNN\:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* 双层RNN\ \:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+在上面代码中，单层和双层序列的使用和示例2中的示例类似，区别是同时处理了两个输入。而对于双层序列，两个输入的子序列长度也并不相同。但是，我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状，和\ :code:`emb2`\ 的序列形状一致。
+
+示例4：beam_search的生成
+========================
+
+TBD
+
+
+词汇表
+======
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络，通常要求时间步之间具有一些依赖性，即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。
+
+..  graphviz:: src/glossary_rnn.dot
+
+上图中虚线的连接，即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候，将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出，然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+使用这种方式，PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的，哪些不是。
+
+..  _glossary_timestep:
+
+时间步
+------
+
+参考时间序列。
+
+
+..  _glossary_sequence:
+
+时间序列
+--------
+
+时间序列(time series)是指一系列的特征数据。这些特征数据之间的顺序是有意义的。即特征的数组，而不是特征的集合。而这每一个数组元素，或者每一个系列里的特征数据，即为一个时间步(time step)。值得注意的是，时间序列、时间步的概念，并不真正的和『时间』有关。只要一系列特征数据中的『顺序』是有意义的，即为时间序列的输入。
+
+举例说明，例如文本分类中，我们通常将一句话理解成一个时间序列。比如一句话中的每一个单词，会变成词表中的位置。而这一句话就可以表示成这些位置的数组。例如 :code:`[9, 2, 3, 5, 3]` 。
+
+关于时间序列(time series)的更详细准确的定义，可以参考 `维基百科页面 Time series <https://en.wikipedia.org/wiki/Time_series>`_ 或者 `维基百科中文页面 时间序列 <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_ 。
+
+另外，Paddle中经常会将时间序列成为 :code:`Sequence` 。他们在Paddle的文档和API中是一个概念。 
+
+..  _glossary_RNN:
+
+RNN
+---
+
+RNN 在PaddlePaddle的文档中，一般表示 :code:`Recurrent neural network`，即时间递归神经网络。详细介绍可以参考 `维基百科页面 Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ 或者 `中文维基百科页面 <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ 中关于时间递归神经网络的介绍。
+
+RNN 一般在PaddlePaddle中，指对于一个时间序列输入数据，每一个时间步之间的神经网络具有一定的相关性。例如，某一个神经元的一个输入为上一个时间步网络中某一个神经元的输出。或者，从每一个时间步来看，神经网络的网络结构中具有有向环结构。
+
+..  _glossary_双层RNN:
+
+双层RNN
+-------
+
+双层RNN顾名思义，即RNN之间有一次嵌套关系。输入数据整体上是一个时间序列，而对于每一个内层特征数据而言，也是一个时间序列。即二维数组，或者数组的数组这个概念。 而双层RNN是可以处理这种输入数据的网络结构。
+
+例如，对于段落的文本分类，即将一段话进行分类。我们将一段话看成句子的数组，每个句子又是单词的数组。这便是一种双层RNN的输入数据。而将这个段落的每一句话用lstm编码成一个向量，再对每一句话的编码向量用lstm编码成一个段落的向量。再对这个段落向量进行分类，即为这个双层RNN的网络结构。
+
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9e805ca85191b793c8798a239927a318c70b96f5
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -0,0 +1,9 @@
+RNN相关模型
+===========
+
+..  toctree::
+  :maxdepth: 1
+
+  recurrent_group_cn.md
+  hierarchical_layer_cn.rst
+  hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7adc79873d699fdfd5a85034bcef964dd1f19132
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -0,0 +1,7 @@
+RNN Models
+==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc/howto/deep_model/rnn/recurrent_group_cn.md
similarity index 98%
rename from doc_cn/algorithm/rnn/rnn-tutorial.md
rename to doc/howto/deep_model/rnn/recurrent_group_cn.md
index 9e488b0d51956e86f9fb76f450fdb438f596e239..984fdcc505cdd073d0265c496cda5fb3553c22e4 100644
--- a/doc_cn/algorithm/rnn/rnn-tutorial.md
+++ b/doc/howto/deep_model/rnn/recurrent_group_cn.md
@@ -1,96 +1,96 @@
-# Recurrent Group教程
-
-## 概述
-
-序列数据是自然语言处理任务面对的一种主要输入数据类型。
-
-一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
-
-双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
-
-在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
-
-更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
-
-目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
- 
-## 相关概念
-
-### 基本原理
-`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
-
-PaddlePaddle中，`recurrent_group`的一个简单调用如下：
-
-``` python
-recurrent_group(step, input, reverse)
-```
-- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
-- input：输入，必须是一个单层序列，或者一个双层序列
-- reverse：是否以逆序处理输入序列
- 
-使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
-
-### 输入
-`recurrent_group`处理的输入序列主要分为以下三种类型：
- 
-- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
-		
-- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
-	  
-- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
-
-### 输入示例
-
-序列生成任务大多遵循encoder-decoer架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
-
-给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
-    
-- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
-
-- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。
-		
-在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
-		 
-### 输出
-`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
-
-### memory
-memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出，因此，可以将memory理解为一个时延操作。
-
-可以显示地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
-
-## 双层RNN介绍
-`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
-
-利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
-
-- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
-- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
-
-为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
-
-## 双层RNN的使用
-
-### 训练流程的使用方法
-使用 `recurrent_group`需要遵循以下约定：
- 
-- **单进单出**：输入和输出都是单层序列。
-  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
-  - 输出一个单层序列，输出序列的词语数和输入序列一致。
-  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
-  - boot_layer：memory的初始状态，默认初始状为0，memory的is_seq参数必须为false。
- 
-- **双进双出**：输入和输出都是双层序列。
-  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
-  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
-  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
-  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
-
-- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
- 
-
-### 生成流程的使用方法
-使用`beam_search`需要遵循以下约定：
-
-- 单层RNN：从一个word生成下一个word。
+# Recurrent Group教程
+
+## 概述
+
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+
+目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
+ 
+## 相关概念
+
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+ 
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+ 
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+		
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+	  
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+
+### 输入示例
+
+序列生成任务大多遵循encoder-decoer架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+    
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。
+		
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+		 
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出，因此，可以将memory理解为一个时延操作。
+
+可以显示地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+
+## 双层RNN的使用
+
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+ 
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状为0，memory的is_seq参数必须为false。
+ 
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+ 
+
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+
+- 单层RNN：从一个word生成下一个word。
 - 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
similarity index 96%
rename from doc/algorithm/rnn/rnn.rst
rename to doc/howto/deep_model/rnn/rnn_config_en.rst
index 01d2caefb5cdf4e949511fd0f5bbafe0e604e881..73f5d5371fcd3ce95253cad47b0d8e738284441c 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th
     yield src_ids, trg_ids, trg_ids_next
 
 
-For more details description of how to write a data provider, please refer to `PyDataProvider2 <../../ui/data_provider/index.html>`_. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
+For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
 
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -42,8 +42,8 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
 
-.. image:: ./bi_lstm.jpg
-	 :align: center
+.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+     :align: center
 
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
 
@@ -101,12 +101,12 @@ Sequence to Sequence Model with Attention
 -----------------------------------------
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
-.. image:: ./encoder-decoder-attention-model.png
- 	 :align: center
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
 
-The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to `Layers <../../ui/api/trainer_config_helpers/layers_index.html>`_  for more details.
+The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to :ref:`api_trainer_config_helpers_layers` for more details.
 
 We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
 
@@ -246,6 +246,6 @@ The code is listed below:
     outputs(beam_gen)
 
 
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `Semantic Role Labeling Demo <../../demo/semantic_role_labeling/index.html>`_ for more details.
+Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
 
 The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/deep_model/rnn/src/glossary_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758
--- /dev/null
+++ b/doc/howto/deep_model/rnn/src/glossary_rnn.dot
@@ -0,0 +1,42 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		color=blue
+
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+	}
+	
+	
+	fc0_1 -> fc1_1 [style="dotted" constraint=false]
+	fc1_1 -> fc2_1 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
new file mode 100644
index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb
--- /dev/null
+++ b/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
@@ -0,0 +1,48 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+		m0 [label="memory"]
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+		fc0_1 -> m0
+		m0 -> fc0_1
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		m1 [label="memory"]
+		color=blue
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+		fc1_1 -> m1
+		m1 -> fc1_1
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+		m2 [label="memory"]
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+		fc2_1 -> m2
+		m2 -> fc2_1
+	}
+	
+	
+	m0 -> m1 [style="dotted" constraint=false]
+	m1 -> m2 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581
--- /dev/null
+++ b/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
@@ -0,0 +1,30 @@
+digraph G {
+  rankdir=LR;
+
+  subgraph cluster_t0 {
+    a [label="4"]
+    b [label="5"]
+    c [label="2"]
+  }
+  
+  subgraph cluster_t1 {
+    d [label="0"]
+    e [label="9"]
+  }
+
+  subgraph cluster_t2 {
+    f [label="8"]
+    g [label="1"]
+    h [label="4"]
+  }
+
+  a -> b;
+  b -> c;
+  c -> d [constraint=false];
+
+  d -> e;
+  e -> f [constraint=false];
+  
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59
--- /dev/null
+++ b/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
@@ -0,0 +1,19 @@
+digraph G {
+  rankdir=LR;
+  a [label="4"]
+  b [label="5"]
+  c [label="2"]
+  d [label="0"]
+  e [label="9"]
+  f [label="8"]
+  g [label="1"]
+  h [label="4"]
+
+  a -> b;
+  b -> c;
+  c -> d;
+  d -> e;
+  e -> f;
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/dev/new_layer/FullyConnected.jpg b/doc/howto/dev/FullyConnected.jpg
similarity index 100%
rename from doc/dev/new_layer/FullyConnected.jpg
rename to doc/howto/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0a63f5a14c7b2e8953aa21739668ee2a9ebeff1
--- /dev/null
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,131 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+ 
+## 代码要求
+- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+
+以下教程将指导您提交代码。
+ 
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+ 
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+## 克隆（Clone）
+
+Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
+**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+
+一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+
+```shell
+# 克隆 fork 到本地
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+如果你的仓库不包含 **develop** 分支，你只需自己创建它。
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # 创建 develop 分支
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
+git pull upstream develop  # 更新 upstream
+git submodule update --init --recursive
+```
+
+然后你可以通过做一个本地开发分支开始开发
+
+```shell
+git checkout -b MY_COOL_STUFF_BRANCH
+```
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL 
+，git 中不要添加大文件）。 `pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子
+的 PR 不能提交代码到 Paddle。
+
+你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
+目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+
+然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
+提交你的代码时，pre-commit 钩子会检查本地代码是否存在
+不适合提交的东西，等等。
+
+## 提交（Commit）
+
+提交你的代码：
+
+```shell
+# 显示工作树状态
+git status
+# 添加修改过的文件
+git add xx
+env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```
+提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
+
+## 保持 Fork 状态最新
+
+在拉（pull）你的请求（request）之前，你应该从最新的 PaddlePaddle 同步代码。
+为此，你需要首先添加远程（remote）：
+
+```shell
+# 观察当前远程仓库配置
+git remote -v
+# 添加上游（upstream）仓库
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+# 验证新的 upstream
+git remote -v
+```
+
+用最新的 upstream 更新你的 fork：
+
+```shell
+git pull --rebase upstream develop
+```
+如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
+
+现在，你的本地主分支与上游修改的一致并是最新的。
+
+## 推送（Push）到 GitHub
+
+```shell
+# 在 GitHub 上 push 你的仓库
+git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```
+
+## 拉取请求（Pull Request）
+
+转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+
+## 使用最新版本更新你的 pull 请求
+
+在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop
+# 你可能需要根据git提示解决冲突
+# 创建并测试你的代码
+git push origin MY_COOL_STUFF_BRANCH
+```
+现在你的 Pull Request 是最新的了。
+
+## 修改你的 pull request
+
+当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+
+可能的命令是
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop   # 将本地更新到最新的代码库
+# 可能会发生一些冲突
+# 开始开发吧！
+env EDITOR=vim git commit  # 添加修改日志
+git push origin MY_COOL_STUFF_BRANCH
+```
diff --git a/doc/build/contribute_to_paddle.md b/doc/howto/dev/contribute_to_paddle_en.md
similarity index 77%
rename from doc/build/contribute_to_paddle.md
rename to doc/howto/dev/contribute_to_paddle_en.md
index 1d03eb7362b1b6f2fcdac7b53f8b7f93fb75e49c..e578f6fce8b94180da7d5de041a0e17b1d59f6ea 100644
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -1,8 +1,8 @@
 # Contribute Code
 
 We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code. 
- 
+workflow to merge your code.
+
 ## Code Requirements
 - Your code must be fully documented by
   [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
@@ -12,11 +12,11 @@ workflow to merge your code.
 - Pass all unit tests.
 
 The following tutorial guides you into submitting your contibution.
- 
+
 ## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
- 
+
 Just head over to the GitHub page and click the "Fork" button.
-It's just that simple. 
+It's just that simple.
 
 ## Clone
 
@@ -25,7 +25,7 @@ The **develop** is the main branch, and other user's branches are feature branch
 
 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
- 
+
 ```shell
 # Clone your fork to your local machine
 git clone --branch develop https://github.com/USERNAME/Paddle.git
@@ -36,8 +36,9 @@ If your repository doesn't contain **develop** branch, just create it by your ow
 git clone https://github.com/USERNAME/Paddle.git Paddle
 cd Paddle
 git checkout -b develop  # create develop branch.
-git remote add upstream https://github.com/baidu/Paddle.git  # add upstream to baidu/Paddle
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
+git submodule update --init --recursive
 ```
 
 Then you can start to develop by making a local developement branch
@@ -46,6 +47,22 @@ Then you can start to develop by making a local developement branch
 git checkout -b MY_COOL_STUFF_BRANCH
 ```
 
+## Using `pre-commit` hook
+
+Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
+pre-commit hooks. It can help us format source codes (cpp, python), check some
+basic thing before commit (only one EOL for each file, do not add a huge file
+in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every
+PR doesn't fit hook can not be merged into Paddle.
+
+To use [pre-commit](http://pre-commit.com/), you should install it by
+`pip install pre-commit`, and currently, Paddle uses `clang-format` to format
+c/cpp sources. Please make sure clang-format 3.8+ installed.
+
+Then just run `pre-commit install` in your Paddle clone directory. When you
+commit your code, the pre-commit hook will check the local code if there is
+anything not suitable to commit, and so on.
+
 ## Commit
 
 Commit your changes by following command lines:
@@ -69,7 +86,7 @@ To do this, you'll need to add a remote at first:
 # see the current configured remote repository
 git remote -v
 # add upstream repository
-git remote add upstream https://github.com/baidu/Paddle.git
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
 # verify the new upstream
 git remote -v
 ```
@@ -82,7 +99,7 @@ git pull --rebase upstream develop
 
 If there are no unique commits locally, git will simply perform a fast-forward.
 However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts. 
+probably shouldn't be), you may have to deal with conflicts.
 
 Now, your local master branch is up-to-date with everything modified upstream.
 
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/howto/dev/new_layer_en.rst
similarity index 99%
rename from doc/dev/new_layer/new_layer.rst
rename to doc/howto/dev/new_layer_en.rst
index af8b76a3075194ead9be40d2c943238b2cfadecc..0513f068f39ad0d931b03d066a0083a1a8a33b79 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -1,6 +1,6 @@
-==================
-Writing New Layers
-==================
+================
+Write New Layers
+================
 
 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.
 
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc/howto/dev/write_docs_cn.rst
similarity index 90%
rename from doc_cn/howto/how_to_write_docs/index.rst
rename to doc/howto/dev/write_docs_cn.rst
index a1f983b3405fa40f436885e40fca2ebbb4695491..5051a892304fdc8b0f1a19a7d4560d5ee007c47d 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-###############################
-如何贡献/修改PaddlePaddle的文档
-###############################
+##################
+如何贡献/修改文档
+##################
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 
@@ -51,4 +51,4 @@ TBD
 
 
 ..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
+..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e03138723e4df951b5fb1bd28f98a33e679b454a
--- /dev/null
+++ b/doc/howto/index_cn.rst
@@ -0,0 +1,37 @@
+进阶指南
+========
+
+使用说明
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  usage/concepts/use_concepts_cn.rst
+  usage/cluster/k8s/k8s_cn.md
+  usage/cluster/k8s/k8s_distributed_cn.md
+
+开发标准
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  dev/write_docs_cn.rst
+  dev/contribute_to_paddle_cn.md
+
+模型配置
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_cn.rst
+
+性能优化
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..983dc743eb453a0210bc5fb3c7e4525fa838d428
--- /dev/null
+++ b/doc/howto/index_en.rst
@@ -0,0 +1,36 @@
+HOW TO
+=======
+
+Usage
+-------
+
+..  toctree::
+  :maxdepth: 1
+
+  usage/cmd_parameter/index_en.md
+  usage/cluster/cluster_train_en.md
+
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  dev/new_layer_en.rst
+  dev/contribute_to_paddle_en.md
+
+Configuration
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_en.rst
+
+Optimization
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2b0b0396e0034b01ed2c5081effdd3bcabf31ae
--- /dev/null
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+==================
+GPU性能分析与调优
+==================
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, processor has two key performance limits include float point throughput and
+memory throughput. For GPU,  it also need more parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/howto/optimization/gpu_profiling_en.rst b/doc/howto/optimization/gpu_profiling_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed208ceaf7af0c5aab88fd4fcb18fa96b8c9ff38
--- /dev/null
+++ b/doc/howto/optimization/gpu_profiling_en.rst
@@ -0,0 +1,240 @@
+====================
+Tune GPU Performance 
+====================
+
+..  contents::
+
+This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
+
+- What is profiling?
+- Why we need profiling?
+- How to do profiling?
+- Profile tools
+- Hands-on Tutorial
+- Profiling tips
+
+What's profiling?
+=================
+In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
+complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
+Most commonly, profiling information serves to aid program optimization.
+
+Briefly, profiler is used to measure application performance. Program analysis tools are extremely important for
+understanding program behavior. Simple profiling can tell you that how long does an operation take? For advanced
+profiling, it can interpret why does an operation take a long time?
+
+Why we need profiling?
+======================
+Since training deep neural network typically take a very long time to get over, performance is gradually becoming
+the most important thing in deep learning field. The first step to improve performance is to understand what parts
+are slow.  There is no point in improving performance of a region which doesn’t take much time!
+
+
+How to do profiling?
+====================
+To achieve maximum performance, there are five steps you can take to reach your goals.
+
+- Profile the code
+- Find the slow parts
+- Work out why they’re slow
+- Make them fast
+- Profile the code again
+
+Usually, processor has two key performance limits include float point throughput and
+memory throughput. For GPU,  it also need more parallelism to fulfill its potential.
+This is why they can be so fast.
+
+Profiler Tools
+==============
+For general GPU profiling, a bunch of tools are provided from both NVIDIA and third party.
+
+**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler.
+In this tutorial, we will focus on nvprof and nvvp.
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+The above code snippet includes two methods, you can use any of them to profile the regions of interest.
+
+1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+You can find more details about how to use both of them in the next session.
+
+Hands-on Approach
+=================
+
+Built-in Timer
+--------------
+
+To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of you interest.
+Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
+As a simple example, consider the following:
+
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. Execute your code and observe the results (see the emphasize-lines).
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof profiler
+---------------
+
+To use this command line profiler **nvprof**, you can simply issue the following command:
+
+1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. Use Nvidia profiler **nvprof** to profile the binary.
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+Then, you can get the following profiling result:
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp profiler
+-------------
+
+For visual profiler **nvvp**, you can either import the output of :code:`nvprof –o ...` or
+run application through GUI.
+
+**Note: nvvp also support CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+From the perspective of kernel functions, **nvvp** can even illustrate why does an operation take a long time?
+As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp`
+allow us to fully utilize all warps on the GPU.
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck.
+For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance.
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+Profiling tips
+==============
+
+- The **nvprof** and **nvvp** output is a very good place to start.
+- The timeline is a good place to go next.
+- Only dig deep into a kernel if it’s taking a significant amount of your time.
+- Where possible, try to match profiler output with theory.
+    1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
+    2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
+- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
+
+
+Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
+Your mileage may vary!
+
+Reference
+=========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/howto/optimization/nvvp1.png b/doc/howto/optimization/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/howto/optimization/nvvp1.png differ
diff --git a/doc/howto/optimization/nvvp2.png b/doc/howto/optimization/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/howto/optimization/nvvp2.png differ
diff --git a/doc/howto/optimization/nvvp3.png b/doc/howto/optimization/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/howto/optimization/nvvp3.png differ
diff --git a/doc/howto/optimization/nvvp4.png b/doc/howto/optimization/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/howto/optimization/nvvp4.png differ
diff --git a/doc/cluster/opensource/cluster_train.md b/doc/howto/usage/cluster/cluster_train_en.md
similarity index 99%
rename from doc/cluster/opensource/cluster_train.md
rename to doc/howto/usage/cluster/cluster_train_en.md
index cb493a88f031850cb6a5eeed0ebe9e41bb7e01c3..2fd24e532e3b8cb7572e1d4c2e5acbb5d57bc567 100644
--- a/doc/cluster/opensource/cluster_train.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,4 +1,4 @@
-# Distributed Training
+# Run Distributed Training
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
@@ -9,7 +9,7 @@ In this article, we explain how to run distributed Paddle training jobs on clust
 1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
 
    ```bash
-pip install fabric
+   pip install fabric
    ```
 
 1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
diff --git a/doc/howto/usage/cluster/k8s/Dockerfile b/doc/howto/usage/cluster/k8s/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3a73606c61432329b4cc2d2f8daadc5af8735c96
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/Dockerfile
@@ -0,0 +1,7 @@
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash"," -c","/root/start.sh"]
\ No newline at end of file
diff --git a/doc/howto/usage/cluster/k8s/job.yaml b/doc/howto/usage/cluster/k8s/job.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..488aad0bede4f940b25c7be04259f209c3de9f52
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/job.yaml
@@ -0,0 +1,43 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath: 
+          path: /home/work/paddle_output              
+      containers:
+      - name: trainer
+        image: registry.baidu.com/public/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]        
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath     
+        - name: JOB_NAMESPACE
+          value: default         
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0  
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"     
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"  
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"                                                               
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath       
+      restartPolicy: Never
+    
diff --git a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..a8c64550b1fa7f41de1eaa9a037c65cddc0cd30e
Binary files /dev/null and b/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/cluster/k8s/k8s_cn.md b/doc/howto/usage/cluster/k8s/k8s_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2575701053ca12cc3af45682af6cd682a88bb987
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/k8s_cn.md
@@ -0,0 +1,205 @@
+# Kubernetes 单机训练
+
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+
+## 制作Docker镜像
+
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+Paddle的Docker image里。为此，我们需要制作一个包含训练数据的Paddle镜像。
+
+Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
+里介绍了用Paddle源码中的脚本下载训练数据的过程。
+而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo，（ 请注意，默认的
+Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ），所以我们使用这个镜像来下载训练数据到Docker container中，然后把这个包含了训练数据的container保存为一个新的镜像。
+  
+### 运行容器
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### 下载数据
+
+进入容器`/root/paddle/demo/quick_start/data`目录，使用`get_data.sh`下载数据
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### 修改启动脚本
+
+下载完数据后，修改`/root/paddle/demo/quick_start/train.sh`文件，内容如下（增加了一条cd命令）
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### 提交镜像
+
+修改启动脚本后，退出容器，使用`docker commit`命令创建新镜像。
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## 使用 Kubernetes 进行训练
+
+>针对任务运行完成后容器自动退出的场景，Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。
+
+### 编写yaml文件
+
+在训练时，输出结果可能会随着容器的消耗而被删除，需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像，可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job)，简单的yaml文件如下：
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### 创建Paddle Job
+
+使用上文创建的yaml文件创建Kubernetes Job，命令为：
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+查看job的详细情况：
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### 查看训练结果
+
+根据Job对应的Pod信息，可以查看此Pod运行的宿主机。
+
+```
+kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+我们还可以登录到宿主机上查看训练结果。
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..d4d01f2759bd89a3448ed12ee7fd24a091217e47
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
@@ -0,0 +1,308 @@
+# Kubernetes 分布式训练
+
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
+
+## Kubernetes 基本概念
+
+[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、 扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
+
+- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
+
+- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
+
+- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods。
+
+- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
+
+- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+
+## 整体方案
+
+### 部署Kubernetes集群
+
+首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
+
+![paddle on kubernetes结构图](k8s-paddle-arch.png)
+
+上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
+
+### 使用 Job
+
+我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业，在作业完成后，Kubernetes会销毁job产生的容器并且释放相关资源。
+
+在Kubernetes中，可以通过编写一个YAML文件，来描述这个job，在这个文件中，主要包含了一些配置信息，例如PaddlePaddle的节点个数，`paddle pserver`开放的端口个数与端口号，使用的网卡设备等，这些信息通过环境变量的形式传递给容器内的程序使用。
+
+在一次分布式训练中，用户确定好本次训练需要的PaddlePaddle节点个数，将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件，提交给Kubernetes集群创建并开始作业。
+
+### 创建PaddlePaddle节点
+
+当Kubernetes master收到请求，解析完YAML文件后，会创建出多个pod(个数为PaddlePaddle节点数)，Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点，当pod被成功分配到一台物理/虚拟机上后，Kubernetes会启动pod内的容器，这个容器会根据YAML文件中的环境变量，启动`paddle pserver`与`paddle train`进程。
+
+### 启动训练
+
+在容器启动后，会通过脚本来启动这次分布式训练，我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id，由于PaddlePaddle本身不提供类似服务发现的功能，所以在本文的启动脚本中，每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。
+
+根据这些pod信息，就可以通过某种方式，为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序，将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下：
+
+  1. 查询Kubernetes apiserver获取pod信息，根据IP分配trainer_id
+  1. 从MFS共享目录中拷贝训练文件到容器内
+  1. 根据环境变量，解析出`paddle pserver`与`paddle train`的启动参数，启动进程
+  1. 训练时，PaddlePaddle会自动将结果保存在trainer_id为0的节点上，将输出路径设置为MFS目录，保存输出的文件
+
+
+## 搭建过程
+
+根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，主要分为以下几个步骤：
+
+1. 制作PaddlePaddle镜像
+1. 将训练文件与切分好的数据上传到共享存储
+1. 编写本次训练的YAML文件，创建一个Kubernetes job
+1. 训练结束后查看输出结果
+
+下面就根据这几个步骤分别介绍。
+
+
+### 制作镜像
+
+PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境，用这个镜像创建的容器需要有以下两个功能：
+
+- 拷贝训练文件到容器内
+
+- 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
+
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。镜像的*Dockerfile*如下：
+
+```Dockerfile
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash"," -c","/root/start.sh"]
+```
+
+[`start.sh`](start.sh)文件拷贝训练文件到容器内，然后执行[`start_paddle.py`](start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
+
+`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器运行都运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+
+使用 `docker build` 构建镜像：
+
+```bash
+docker build -t your_repo/paddle:mypaddle .
+```
+
+然后将构建成功的镜像上传到镜像仓库。
+
+```bash
+docker push  your_repo/paddle:mypaddle
+```
+
+### 上传训练文件
+
+本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
+
+```bash
+[root@paddle-kubernetes-node0 mfs]# tree -d
+.
+└── paddle-cluster-job
+    ├── data
+    │   ├── 0
+    │   │
+    │   ├── 1
+    │   │
+    │   └── 2
+    ├── output
+    └── recommendation
+```
+
+目录中paddle-cluster-job是本次训练对应的job name，本次训练要求有3个PaddlePaddle节点，在paddle-cluster-job/data目录中存放切分好的数据，文件夹0，1，2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件，output文件夹存放训练结果与日志。
+
+### 创建Job
+
+Kubernetes可以通过YAML文件来创建相关对象，然后可以使用命令行工具创建job。
+
+Job YAML文件描述了这次训练使用的Docker镜像，需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数，也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义，可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如，本次训练的YAML文件可以写成：
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: your_repo/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。
+
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+
+`JOB_PATH`表示共享存储挂载的路径，`JOB_NAME`表示job名字，`TRAIN_CONFIG_DIR`表示本次训练文件所在目录，这三个变量组合就可以找到本次训练需要的文件路径。
+
+`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数，即网卡名
+
+`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数，`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量，也就是`--ports_num`参数。
+
+`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量，也就是`--ports_num_for_sparse`参数。
+
+`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量，即`--num_gradient_servers`参数
+
+编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
+
+```bash
+kubectl create -f job.yaml
+```
+
+创建成功后，Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像，启动容器开始训练。
+
+
+### 查看输出
+
+在训练过程中，可以在共享存储上查看输出的日志和模型，例如output目录下就存放了输出结果。注意node_0，node_1，node_2这几个目录表示PaddlePaddle节点与trainer_id，并不是Kubernetes中的node概念。
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+我们可以通过日志查看容器训练的情况，例如：
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0 
+    --log_period=50 --dot_period=10 --saving_period=1 
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
diff --git a/doc/howto/usage/cluster/k8s/start.sh b/doc/howto/usage/cluster/k8s/start.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3a1334174a20b018d35de3b01b149fc5b10d49d
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/start.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+set -eu
+
+jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
+cd /root
+cp -rf $jobconfig .
+cd $TRAIN_CONFIG_DIR
+
+
+python /root/start_paddle.py \
+  --dot_period=10 \
+  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+  --log_period=50 \
+  --num_passes=10 \
+  --trainer_count=4 \
+  --saving_period=1 \
+  --local=0 \
+  --config=./trainer_config.py \
+  --use_gpu=0
diff --git a/doc/howto/usage/cluster/k8s/start_paddle.py b/doc/howto/usage/cluster/k8s/start_paddle.py
new file mode 100755
index 0000000000000000000000000000000000000000..df00d82919faa2acecc79c28e3d773ba3de9672a
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s/start_paddle.py
@@ -0,0 +1,158 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import time
+import socket
+import os
+import argparse
+
+# configuration for cluster
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_DATA = JOB_PATH + "/data"
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+
+
+def refine_unknown_args(cmd_args):
+    '''
+    refine unknown parameters to handle some special parameters
+    '''
+    new_args = []
+    for arg in cmd_args:
+        if arg.startswith("--") and arg.find("=") != -1:
+            equal_pos = arg.find("=")  # find first = pos
+            arglist = list(arg)
+            arglist[equal_pos] = " "
+            arg = "".join(arglist)
+            arg = arg.lstrip("-")
+            new_args += arg.split(" ")
+        elif arg.startswith("--") and arg.find("=") == -1:
+            arg = arg.lstrip("-")
+            new_args.append(arg)
+        else:
+            new_args.append(arg)
+    return new_args
+
+
+def isPodAllRunning(podlist):
+    '''
+    check all pod is running
+    '''
+    require = len(podlist["items"])
+    running = 0
+    for pod in podlist["items"]:
+        if pod["status"]["phase"] == "Running":
+            running += 1
+    if require == running:
+        return True
+    return False
+
+
+def getPodList():
+    '''
+    get all container status of the job
+    '''
+    apiserver = "https://" + \
+        os.getenv("KUBERNETES_SERVICE_HOST") + ":" + \
+        os.getenv("KUBERNETES_SERVICE_PORT_HTTPS")
+
+    pod = API + NAMESPACE + "/pods?"
+    job = JOBNAME
+    return requests.get(apiserver + pod + JOBSELECTOR + job,
+                        verify=False).json()
+
+
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+
+
+def startPaddle(idMap={}, train_args_dict=None):
+    '''
+    start paddle pserver and trainer
+    '''
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+    logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
+    if not os.path.exists(JOB_PATH_OUTPUT):
+        os.makedirs(JOB_PATH_OUTPUT)
+    os.mkdir(logDir)
+    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
+        "/" + str(trainerId) + " ./data"
+    os.system(copyCommand)
+    startPserver = 'nohup paddle pserver' + \
+        " --port=" + str(PADDLE_PORT) + \
+        " --ports_num=" + str(PADDLE_PORTS_NUM) + \
+        " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \
+        " --nics=" + PADDLE_NIC + \
+        " --comment=" + "paddle_process_by_paddle" + \
+        " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) +\
+        " > " + logDir + "/server.log 2>&1 &"
+    print startPserver
+    os.system(startPserver)
+    # wait until pservers completely start
+    time.sleep(10)
+    startTrainer = program + args + " > " + \
+        logDir + "/train.log 2>&1 < /dev/null"
+    print startTrainer
+    os.system(startTrainer)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        prog="start_paddle.py", description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+    startPaddle(idMap, train_args_dict)
diff --git a/doc/ui/cmd_argument/argument_outline.md b/doc/howto/usage/cmd_parameter/arguments_en.md
similarity index 99%
rename from doc/ui/cmd_argument/argument_outline.md
rename to doc/howto/usage/cmd_parameter/arguments_en.md
index d6cc2c6ed7cc1b9209d56b4348497427efe40ac3..013edbc9047817d7f6b82c4d5188412bd2ce41d6 100644
--- a/doc/ui/cmd_argument/argument_outline.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
 </tr>
 
 <tr>
-<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td>
+<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
diff --git a/doc/ui/cmd_argument/detail_introduction.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
similarity index 96%
rename from doc/ui/cmd_argument/detail_introduction.md
rename to doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 07608e5edf740bd3e1242913f1d2d7589ad313aa..27b2faf1d8a9367ff9498a76d363791ab7fbe61c 100644
--- a/doc/ui/cmd_argument/detail_introduction.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -1,3 +1,7 @@
+```eval_rst
+..  _cmd_detail_introduction:
+```
+
 # Detail Description
 
 ## Common
@@ -31,7 +35,7 @@
   - type: string (default: null).
 
 * `--version`
-  - Whether to print version infomatrion.
+  - Whether to print version information.
   - type: bool (default: 0).
 
 * `--show_layer_stat`
@@ -110,8 +114,8 @@
   - type: int32 (default: -1).
 
 * `--test_period`
-  - Run testing every test_period train batches. If not set, run testing each pass.
-  - type: int32 (default: 1000).
+   - if equal 0, do test on all test data at the end of each pass. While if equal non-zero, do test on all test data every test_period batches.
+  - type: int32 (default: 0).
 
 * `--test_wait`
   - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
@@ -121,10 +125,6 @@
   - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
   - type: string (default: "", null).
 
-* `--test_all_data_in_one_period`
-  - This argument is usually used in testing period during traning. If true, all data will be tested in one test period. Otherwise (batch_size * log_peroid) data will be tested.
-  - type: bool (default: 0).
-
 * `--predict_output_dir`
   - Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function.
   - type: string (default: "", null).
diff --git a/doc/howto/usage/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a96e7e976c43fd69befccd78753cee431ef61bc
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.md
@@ -0,0 +1,8 @@
+```eval_rst
+..  _cmd_line_index:
+```
+# Set Command-line Parameters
+
+* [Use Case](use_case_en.md)
+* [Arguments](arguments_en.md)
+* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/ui/cmd_argument/use_case.md b/doc/howto/usage/cmd_parameter/use_case_en.md
similarity index 97%
rename from doc/ui/cmd_argument/use_case.md
rename to doc/howto/usage/cmd_parameter/use_case_en.md
index a6bfba29af4f73055338c3a671bcafaa1456c7cf..4d7bb33f36fe258ee24796eedc9296065923e58f 100644
--- a/doc/ui/cmd_argument/use_case.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@@ -10,9 +10,8 @@ paddle train \
   --config=network_config \
   --save_dir=output \
   --trainer_count=COUNT \                #(default:1)
-  --test_period=M \                      #(default:1000）
-  --test_all_data_in_one_period=true \   #(default:false) 
-  --num_passes=N \                       #(defalut:100）
+  --test_period=M \                      #(default:0) 
+  --num_passes=N \                       #(defalut:100)
   --log_period=K \                       #(default:100)
   --dot_period=1000 \                    #(default:1)
   #[--show_parameter_stats_period=100] \ #(default:0)
diff --git a/doc_cn/concepts/pserver_topology.dot b/doc/howto/usage/concepts/src/pserver_topology.dot
similarity index 100%
rename from doc_cn/concepts/pserver_topology.dot
rename to doc/howto/usage/concepts/src/pserver_topology.dot
diff --git a/doc_cn/concepts/trainer_config.py b/doc/howto/usage/concepts/src/trainer_config.py
similarity index 100%
rename from doc_cn/concepts/trainer_config.py
rename to doc/howto/usage/concepts/src/trainer_config.py
diff --git a/doc/howto/usage/concepts/use_concepts_cn.rst b/doc/howto/usage/concepts/use_concepts_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..77ba76441910b0696188ad6fa577b92c47129499
--- /dev/null
+++ b/doc/howto/usage/concepts/use_concepts_cn.rst
@@ -0,0 +1,155 @@
+############
+基本使用概念
+############
+
+PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
+
+单机模式用命令 ``paddle train`` 可以启动一个trainer进程，单机训练通常只包括一个trainer进程。如果数据规模比较大，希望加速训练，可以启动分布式作业。一个分布式作业里包括若干trainer进程和若干Parameter Server（或称pserver）进程。用命令 ``paddle pserver`` 可以启动 pserver 进程，pserver进程用于协调多个trainer进程之间的通信。
+
+本文首先介绍trainer进程中的一些使用概念，然后介绍pserver进程中概念。
+
+..    contents::
+
+系统框图
+========
+
+下图描述了用户使用框图，PaddlePaddle的trainer进程里内嵌了Python解释器，trainer进程可以利用这个解释器执行Python脚本，Python脚本里定义了模型配置、训练算法、以及数据读取函数。其中，数据读取程序往往定义在一个单独Python脚本文件里，被称为数据提供器（DataProvider），通常是一个Python函数。模型配置、训练算法通常定义在另一单独Python文件中, 称为训练配置文件。下面将分别介绍这两部分。
+
+..    graphviz:: 
+
+    digraph pp_process {
+        rankdir=LR;
+        config_file [label="用户神经网络配置"];
+        subgraph cluster_pp {
+            style=filled;
+            color=lightgrey;
+            node [style=filled, color=white, shape=box];
+            label = "PaddlePaddle C++";
+            py [label="Python解释器"];
+        }
+        data_provider [label="用户数据解析"];
+        config_file -> py;
+        py -> data_provider [dir="back"];
+    }
+
+数据提供器
+==========
+
+DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据转换成系统可以识别的数据类型。每当系统需要新的数据训练时, trainer进程会调用DataProvider函数返回数据。当所有数据读取完一轮后，DataProvider返回空数据，通知系统一轮数据读取结束，并且系统每一轮训练开始时会重置DataProvider。需要注意的是，DataProvider是被系统调用，而不是新数据驱动系统，一些随机化噪声添加都应该在DataProvider中完成。
+
+在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 `PyDataProvider`_ 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 `PyDataProvider`_ 的相关文档继续深入了解。
+
+
+训练配置文件
+============
+
+训练配置文件主要包括数据源、优化算法、网络结构配置三部分。 其中数据源配置与DataProvider的关系是：DataProvider里定义数据读取函数，训练配置文件的数据源配置中指定DataProvider文件名字、生成数据函数接口，请不要混淆。
+
+一个简单的训练配置文件为：
+
+..  literalinclude:: src/trainer_config.py
+    :linenos:
+
+文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定以Python接口来配置网络，该Python代码可以生成protobuf包，这就是`trainer_config_helpers`_的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
+
+下面分别介绍数据源配置、优化算法配置、网络结构配置这三部分该概念。
+
+数据源配置
+----------
+
+使用 `PyDataProvider`_ 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。
+
+``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 `PyDataProvider`_ 。
+
+优化算法配置
+------------
+
+通过 `settings`_ 接口设置神经网络所使用的训练参数和 `优化算法`_ ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 `settings`_ 文档。
+
+网络结构配置
+------------
+
+神经网络配置主要包括网络连接、激活函数、损失函数、评估器。
+
+- 网络连接： 主要由Layer组成，每个Layer返回的都是一个 ``LayerOutput`` 对象，Layer里面可以定义参数属性、激活类型等。
+
+  为了更灵活的配置，PaddlePaddle提供了基于 Projection 或者 Operator 的配置，这两个需要与 ``mixed_layer`` 配合使用。这里简单介绍Layer、Projection、Operator的概念:
+
+  - Layer: 神经网络的某一层，可以有可学习的参数，一般是封装了许多复杂操作的集合。
+  - Projection：需要与 ``mixed_layer`` 配合使用，含可学习参数。
+  - Operator： 需要与 ``mixed_layer`` 配合使用，不含可学习参数，输入全是其他Layer的输出。
+
+ 
+  这个配置文件网络由 ``data_layer`` 、 ``simple_img_conv_pool`` 、 ``fc_layer`` 组成。
+
+  - `data_layer`_  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
+  - `simple_img_conv_pool`_ ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
+  - `fc_layer`_ ：全连接层，激活函数为Softmax，这里也可叫分类层。
+
+  
+- 损失函数和评估器：损失函数即为网络的优化目标，评估器可以评价模型结果。
+
+  PaddlePaddle包括很多损失函数和评估起，详细可以参考 `损失函数层`_ 和 `评估器`_ 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
+  
+- ``outputs``: 标记网络输出的函数为 ``outputs`` 。
+
+  训练阶段，网络的输出为神经网络的优化目标；预测阶段，网络的输出也可通过 ``outputs`` 标记。
+
+
+这里对 ``mixed_layer`` 稍做详细说明， 该Layer将多个输入(Projection 或 Operator)累加求和，具体计算是通过内部的 Projection 和 Operator 完成，然后加 Bias 和 activation 操作，
+
+例如，和 ``fc_layer`` 同样功能的 ``mixed_layer`` 是:
+
+..    code-block:: python
+   
+       data = data_layer(name='data', size=200)
+       with mixed_layer(size=200) as out:
+           out += full_matrix_projection(input=data)
+
+PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 `mixed_layer`_ 的相关文档进行配置。
+
+
+分布式训练
+==========
+
+PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trainer 进行同步。多机训练的经典拓扑结构如下\:
+
+..    graphviz:: src/pserver_topology.dot
+
+图中每个灰色方块是一台机器，在每个机器中，先使用命令 ``paddle pserver`` 启动一个pserver进程，并指定端口号，可能的参数是\:
+
+..    code-block:: bash
+
+    paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
+
+* ``--port=5000`` : 指定 pserver 进程端口是 5000 。
+* ``--gradient_servers=4`` : 有四个训练进程(PaddlePaddle 将 trainer 也称作 GradientServer ，因为其为负责提供Gradient) 。
+* ``--tcp_rdma='tcp' --nics=`eth0```: 指定以太网类型为TCP网络，指定网络接口名字为eth0。
+
+启动之后 pserver 进程之后，需要启动 trainer 训练进程，在各个机器上运行如下命令\:
+
+..    code-block:: bash
+
+    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
+
+对于简单的多机协同训练使用上述方式即可。另外，pserver/train 通常在高级情况下，还需要设置下面两个参数\：
+
+* --ports_num\: 一个 pserver 进程共绑定多少个端口用来做稠密更新，默认是1。
+* --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新，默认是0。
+
+使用手工指定端口数量，是因为Paddle的网络通信中，使用了 int32 作为消息长度，比较容易在大模型下溢出。所以，在 pserver 进程中可以启动多个子线程去接受 trainer 的数据，这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大，因为增加这个值，对性能尤其是内存占用有一定的开销，另外稀疏更新的端口如果太大的话，很容易导致某一个参数服务器没有分配到任何参数。
+
+详细的说明可以参考，使用 `集群训练Paddle`_ 。
+
+
+..  _PyDataProvider: ../ui/data_provider/pydataprovider2.html
+.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
+.. _优化算法: ../../doc/ui/api/trainer_config_helpers/optimizers.html#optimizers
+.. _trainer_config_helper: ../../doc/ui/api/trainer_config_helpers/index.html
+.. _data_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#data-layer
+.. _simple_img_conv_pool: ../../doc/ui/api/trainer_config_helpers/networks.html#simple-img-conv-pool
+.. _fc_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#fc-layer
+.. _损失函数层: ../../doc/ui/api/trainer_config_helpers/layers.html#cost-layers
+.. _评估器: ../../doc/ui/api/trainer_config_helpers/evaluators.html
+.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
+..  _集群训练Paddle: ../cluster/index.html
diff --git a/doc/index.rst b/doc/index.rst
deleted file mode 100644
index 668ad75a902bdd14c6198c41380ae93e29cec0d3..0000000000000000000000000000000000000000
--- a/doc/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-PaddlePaddle Documentation
-==========================
-
-..  toctree::
-  :maxdepth: 1
-
-  introduction/index.md
-  user_guide.rst
-  dev/index.rst
-  algorithm/index.rst
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..460fedb5658a8ea9bbe8b602ee2b5df66502fa62
--- /dev/null
+++ b/doc/index_cn.rst
@@ -0,0 +1,11 @@
+PaddlePaddle 文档
+======================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  tutorials/index_cn.md
+  howto/index_cn.rst
+  api/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d9cca7de720ebc23fe816f32d158930d91c07e7
--- /dev/null
+++ b/doc/index_en.rst
@@ -0,0 +1,12 @@
+PaddlePaddle Documentation
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  tutorials/index_en.md
+  howto/index_en.rst
+  api/index_en.rst
+  about/index_en.rst
+ 
\ No newline at end of file
diff --git a/doc/introduction/index.md b/doc/introduction/index.md
deleted file mode 100644
index 01f52031a1d0247cd0b885218c17001f23685239..0000000000000000000000000000000000000000
--- a/doc/introduction/index.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Introduction
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-## 1. A Classic Problem
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - <a href="https://en.wikipedia.org/wiki/Simple_linear_regression">**simple linear regression**</a> : you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-## 2. Prepare the Data
-
-Suppose the true relationship can be characterized as `Y = 2X + 0.3`, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-```python
-# dataprovider.py
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-# define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield [x], [2*x+0.3]
-```
-
-## 3. Train a NeuralNetwork in PaddlePaddle
-
-To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line `Y' = wX + b` , then we gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
-
-```python
-# trainer_config.py
-from paddle.trainer_config_helpers import *
-
-# 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f: f.writelines(' ')
-define_py_data_sources2(train_list=data_file, test_list=None, 
-        module='dataprovider', obj='process',args={})
-
-# 2. learning algorithm
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. Network configuration
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-cost = regression_cost(input=y_predict, label=y)
-outputs(cost)
-```
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for `X` and `Y`.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
- ```
- paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- ```
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path `./output`. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-## 4. Evaluate the Model
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: `w=2, b=0.3`, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads parameters from last pass.
-
-```python
-import numpy as np
-import os
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-        
-print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-# w=1.999743, b=0.300137
-```
-
-<center> ![](./parameters.png) </center>
-
-Although starts from a random guess, you can see that value of `w` changes quickly towards 2 and `b` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between `X` and `Y` only from observed data.
-
-
-## 5. Where to Go from Here
-
-- <a href="../build/index.html"> Build and Installation </a>
-- <a href="../demo/quick_start/index_en.html">Quick Start</a>
-- <a href="../demo/index.html">Example and Demo</a>
diff --git a/doc/introduction/parameters.png b/doc/introduction/parameters.png
deleted file mode 120000
index f47e74c94fffabbd32f055febbadb1b18aa0c429..0000000000000000000000000000000000000000
--- a/doc/introduction/parameters.png
+++ /dev/null
@@ -1 +0,0 @@
-../../doc_cn/introduction/parameters.png
\ No newline at end of file
diff --git a/doc/source/api.rst b/doc/source/api.rst
deleted file mode 100644
index 30396c26b61827847cc5acc29cee1c3c8e7b226e..0000000000000000000000000000000000000000
--- a/doc/source/api.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-API
-===
-
-.. doxygenfile:: paddle/api/PaddleAPI.h
-.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/source/cuda/index.rst b/doc/source/cuda/index.rst
deleted file mode 100644
index b0fed2e7f72c9a9671e56e114edfc88d72504dbe..0000000000000000000000000000000000000000
--- a/doc/source/cuda/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-CUDA
-====
-
-.. toctree::
-  :maxdepth: 2
-
-  matrix.rst
-  nn.rst
-  utils.rst
diff --git a/doc/source/cuda/matrix.rst b/doc/source/cuda/matrix.rst
deleted file mode 100644
index b7699c83eda15d9003506f5fc57b51d52e7af823..0000000000000000000000000000000000000000
--- a/doc/source/cuda/matrix.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-Matrix
-======
-
-Base
-----
-
-hl_matrix.h
-```````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix.h
-
-hl_matrix_base.h
-````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh
-
-hl_matrix_apply.cuh
-```````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh
-
-hl_matrix_ops.cuh
-`````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh
-
-hl_matrix_type.cuh
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh
-
-hl_sse_matrix_kernel.cuh
-````````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh
-
-Matrix Function 
----------------
-
-hl_batch_transpose.h
-````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h
-
-hl_aggregate.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_aggregate.h
-
-hl_top_k.h
-``````````
-.. doxygenfile:: paddle/cuda/include/hl_top_k.h
-
-hl_table_apply.h
-````````````````
-.. doxygenfile:: paddle/cuda/include/hl_table_apply.h
-
-Sparse Matrix
--------------
-
-hl_sparse.h
-```````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.h
-
-hl_sparse.ph
-````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
diff --git a/doc/source/cuda/nn.rst b/doc/source/cuda/nn.rst
deleted file mode 100644
index 5577d01e72a5b22847bda40528c46a28cacc1490..0000000000000000000000000000000000000000
--- a/doc/source/cuda/nn.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Neural Network
-==============
-
-Base
-----
-
-.. doxygenfile:: paddle/cuda/include/hl_gpu.h
-.. doxygenfile:: paddle/cuda/include/hl_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_avx_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh
-.. doxygenfile:: paddle/cuda/include/hl_activation_functions.h
-
-
-CNN Related APIs
-----------------
-.. doxygenfile:: paddle/cuda/include/hl_cnn.h
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-RNN Related APIs
-----------------
-
-.. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh
-.. doxygenfile:: paddle/cuda/include/hl_sequence.h
-
-LSTM Model
-``````````
-
-.. doxygenfile:: paddle/cuda/include/hl_lstm.h
-.. dpxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh
-.. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh
-.. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh
-
-GRU Model
-`````````
-
-.. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh
-.. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh
-.. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh
diff --git a/doc/source/cuda/utils.rst b/doc/source/cuda/utils.rst
deleted file mode 100644
index 850e8bd1c6670947e2a5f1b6f9b0d5b252117cbf..0000000000000000000000000000000000000000
--- a/doc/source/cuda/utils.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-Utils
-=====
-
-Dynamic Link Libs
------------------
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
--------------
-
-hl_cuda.ph
-``````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-`````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-HPPL Base
----------
-.. doxygenfile:: paddle/cuda/include/hl_base.h
-
-CUBLAS Wrapper
---------------
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-Timer
------
-.. doxygenfile:: paddle/cuda/include/hl_time.h
-
-Thread Resource
----------------
-.. doxygenfile:: paddle/cuda/include/hl_thread.ph
-
-Device Function
----------------
-.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
diff --git a/doc/source/gserver/activations.rst b/doc/source/gserver/activations.rst
deleted file mode 100644
index 55b9d3be383c07842d7066280cc0e174788db1fb..0000000000000000000000000000000000000000
--- a/doc/source/gserver/activations.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Activations
-===========
-
-..  doxygenclass:: paddle::ActivationFunction
-    :members:
diff --git a/doc/source/gserver/dataproviders.rst b/doc/source/gserver/dataproviders.rst
deleted file mode 100644
index c30d9d6a36a6fbb664ae001274b6a7b0e721070f..0000000000000000000000000000000000000000
--- a/doc/source/gserver/dataproviders.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-==============
-Data Providers
-==============
-
-DataProviders
-=============
-
-Base
-----
-..  doxygenclass:: paddle::DataProvider
-    :members:
-
-DataProviderGroup
------------------
-..  doxygenclass:: paddle::DataProviderGroup
-    :members:
-
-MultiDataProvider
------------------
-..  doxygenclass:: paddle::MultiDataProvider
-    :members:
-
-PyDataProvider
-==============
-
-IFieldScanner
--------------
-..  doxygenclass:: paddle::IFieldScanner
-    :members:
-
-DenseScanner
--------------
-..  doxygenclass:: paddle::DenseScanner
-    :members:
-
-IndexScanner
--------------
-..  doxygenclass:: paddle::IndexScanner
-    :members:
-
-SparseNonValueScanner
----------------------
-..  doxygenclass:: paddle::SparseNonValueScanner
-    :members:
-
-SparseValueScanner
-------------------
-..  doxygenclass:: paddle::SparseValueScanner
-    :members:
-
-SequenceScanner
----------------
-..  doxygenclass:: paddle::SparseValueScanner
-    :members:
-
-IPyDataProviderCache
---------------------
-..  doxygenclass:: paddle::IPyDataProviderCache
-    :members:
-
-NoCacheStrategy
----------------
-..  doxygenclass:: paddle::NoCacheStrategy
-    :members:
-
-CacheOnePassInMemory
---------------------
-..  doxygenclass:: paddle::CacheOnePassInMemory
-    :members:
-
-IPyDataProvider
----------------
-..  doxygenclass:: paddle::PyDataProvider2
-    :members:
-
-ProtoDataProvider
-=================
-
-ProtoDataProvider
-----------------
-..  doxygenclass:: paddle::ProtoDataProvider
-    :members:
-
-ProtoSequenceDataProvider
--------------------------
-..  doxygenclass:: paddle::ProtoSequenceDataProvider
-    :members:
diff --git a/doc/source/gserver/evaluators.rst b/doc/source/gserver/evaluators.rst
deleted file mode 100644
index f5361f76cd2b1c9c004221c03ea05b2c1f3a652e..0000000000000000000000000000000000000000
--- a/doc/source/gserver/evaluators.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-==========
-Evaluators
-==========
-
-Base
-====
-
-..  doxygenclass:: paddle::Evaluator
-    :members:
-
-Sum
-===
-
-SumEvaluator
-------------
-..  doxygenclass:: paddle::SumEvaluator
-    :members:
-
-ColumnSumEvaluator
-------------------
-..  doxygenclass:: paddle::ColumnSumEvaluator
-    :members:
-
-Classification
-==============
-
-ClassificationErrorEvaluator
----------------------------
-..  doxygenclass:: paddle::ClassificationErrorEvaluator
-    :members:
-
-SequenceClassificationErrorEvaluator
-------------------------------------
-..  doxygenclass:: paddle::SequenceClassificationErrorEvaluator
-    :members:
-
-AucEvaluator
--------------
-..  doxygenclass:: paddle::AucEvaluator
-    :members:
-
-PrecisionRecallEvaluator
-------------------------
-..  doxygenclass:: paddle::PrecisionRecallEvaluator
-    :members:
-
-ChunkEvaluator
---------------
-..  doxygenclass:: paddle::ChunkEvaluator
-    :members:
-
-CTCEvaluator
-------------
-..  doxygenclass:: paddle::CTCErrorEvaluator
-    :members:
-
-
-Rank
-====
-
-PnpairEvaluator
--------------
-..  doxygenclass:: paddle::PnpairEvaluator
-    :members:
-
-AucEvaluator
--------------
-..  doxygenclass:: paddle::RankAucEvaluator
-    :members:
-
-
-Printer
-=======
-
-ValuePrinter
--------------
-..  doxygenclass:: paddle::ValuePrinter
-    :members:
-
-GradientPrinter
----------------
-..  doxygenclass:: paddle::GradientPrinter
-    :members:
-
-MaxIdPrinter
-------------
-..  doxygenclass:: paddle::MaxIdPrinter
-    :members:
-
-MaxFramePrinter
----------------
-..  doxygenclass:: paddle::MaxFramePrinter
-    :members:
-
-SequenceTextPrinter
-------------------
-..  doxygenclass:: paddle::SequenceTextPrinter
-    :members:
-
-ClassificationErrorPrinter
---------------------------
-..  doxygenclass:: paddle::ClassificationErrorPrinter
-    :members:
diff --git a/doc/source/gserver/gradientmachines.rst b/doc/source/gserver/gradientmachines.rst
deleted file mode 100644
index 04c8e91d0316a45ad10b0ed0513d3e8916b7c3d9..0000000000000000000000000000000000000000
--- a/doc/source/gserver/gradientmachines.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Gradient Machines
-=================
-
-GradientMachine
----------------
-..  doxygenclass:: paddle::GradientMachine
-    :members:
-
-GradientMachineMode
--------------------
-..  doxygenclass:: paddle::IGradientMachineMode
-    :members:
-
-MultiGradientMachine
---------------------
-..  doxygenclass:: paddle::MultiGradientMachine
-    :members:
-
-TrainerThread
-`````````````
-..  doxygenclass:: paddle::TrainerThread
-    :members:
-
-RecurrentGradientMachine
-------------------------
-..  doxygenclass:: paddle::RecurrentGradientMachine
-    :members:
diff --git a/doc/source/gserver/index.rst b/doc/source/gserver/index.rst
deleted file mode 100644
index 223b00b9a9dbf1db40ce702cf0e154e5e53a8644..0000000000000000000000000000000000000000
--- a/doc/source/gserver/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-GServer
-=======
-
-.. toctree::
-  :maxdepth: 2
-
-  activations.rst
-  dataproviders.rst
-  evaluators.rst
-  gradientmachines.rst
-  layers.rst
-  neworks.rst
diff --git a/doc/source/gserver/layers.rst b/doc/source/gserver/layers.rst
deleted file mode 100644
index 191b2bdff26ed17437370a12036f9dbb174dae15..0000000000000000000000000000000000000000
--- a/doc/source/gserver/layers.rst
+++ /dev/null
@@ -1,566 +0,0 @@
-======
-Layers
-======
-
-Base
-====
-
-Layer 
------
-..  doxygenclass:: paddle::Layer
-    :members:
-
-Projection
-----------
-..  doxygenclass:: paddle::Projection
-    :members:
-
-Operator
---------
-..  doxygenclass:: paddle::Operator
-    :members:
-    
-Data Layer
-==========
-
-..  doxygenclass:: paddle::DataLayer
-    :members:
-
-Fully Connected Layers
-======================
-
-FullyConnectedLayer
--------------------
-..  doxygenclass:: paddle::FullyConnectedLayer
-    :members:
-
-SelectiveFullyConnectedLayer
-----------------------------
-..  doxygenclass:: paddle::SelectiveFullyConnectedLayer
-    :members:
-
-Conv Layers
-===========
-
-ConvBaseLayer
--------------
-..  doxygenclass:: paddle::ConvBaseLayer
-    :members:
-
-ConvOperator
-------------
-..  doxygenclass:: paddle::ConvOperator
-    :members:
-
-ConvShiftLayer
---------------
-..  doxygenclass:: paddle::ConvShiftLayer
-    :members:
-
-CudnnConvLayer
---------------
-..  doxygenclass:: paddle::CudnnConvLayer
-    :members:
-
-ExpandConvBaseLayer
--------------------
-..  doxygenclass:: paddle::ExpandConvBaseLayer
-    :members:
-
-ExpandConvLayer
----------------
-..  doxygenclass:: paddle::ExpandConvLayer
-    :members:
-
-ContextProjection
------------------
-..  doxygenclass:: paddle::ContextProjection
-    :members:
-
-Pooling Layers
-==============
-
-PoolLayer
----------
-..  doxygenclass:: paddle::PoolLayer
-    :members:
-
-PoolProjectionLayer
--------------------
-..  doxygenclass:: paddle::PoolProjectionLayer
-    :members:
-
-CudnnPoolLayer
---------------
-..  doxygenclass:: paddle::CudnnPoolLayer
-    :members:
-
-SpatialPyramidPoolLayer
------------------------
-..  doxygenclass:: paddle::SpatialPyramidPoolLayer
-    :members:
-
-MaxOutLayer
------------
-..  doxygenclass:: paddle::MaxOutLayer
-    :members:
-
-Norm Layers
-===========
-
-NormLayer
----------
-..  doxygenclass:: paddle::NormLayer
-    :members:
-
-CMRProjectionNormLayer
-----------------------
-..  doxygenclass:: paddle::CMRProjectionNormLayer
-    :members:
-
-DataNormLayer
--------------
-..  doxygenclass:: paddle::DataNormLayer
-    :members:
-
-ResponseNormLayer
------------------
-..  doxygenclass:: paddle::ResponseNormLayer
-    :members:
-
-BatchNormBaseLayer
-------------------
-..  doxygenclass:: paddle::BatchNormBaseLayer
-    :members:
-
-BatchNormalizationLayer
------------------------
-..  doxygenclass:: paddle::BatchNormalizationLayer
-    :members:
-
-CudnnBatchNormLayer
------------------------
-..  doxygenclass:: paddle::CudnnBatchNormLayer
-    :members:
-
-SumToOneNormLayer
------------------
-..  doxygenclass:: paddle::SumToOneNormLayer
-    :members:
-
-Activation Layer
-================
-
-ParameterReluLayer
-------------------
-..  doxygenclass:: paddle::ParameterReluLayer
-    :members:
-
-Recurrent Layers
-================
-
-RecurrentLayer
---------------
-..  doxygenclass:: paddle::RecurrentLayer
-    :members:
-
-SequenceToBatch
----------------
-..  doxygenclass:: paddle::SequenceToBatch
-    :members:
-
-LSTM
-----
-LstmLayer
-`````````
-..  doxygenclass:: paddle::LstmLayer
-    :members:
-
-LstmStepLayer
-`````````````
-..  doxygenclass:: paddle::LstmStepLayer
-    :members:
-
-LstmCompute
-```````````
-..  doxygenclass:: paddle::LstmCompute
-    :members:
-
-MDLSTM
-------
-MDLstmLayer
-```````````
-..  doxygenclass:: paddle::MDLstmLayer
-    :members:
-
-CoordIterator
-`````````````
-..  doxygenclass:: paddle::CoordIterator
-    :members:
-
-GRU
----
-GatedRecurrentLayer
-```````````````````
-..  doxygenclass:: paddle::GatedRecurrentLayer
-    :members:
-
-GruStepLayer
-````````````
-..  doxygenclass:: paddle::GruStepLayer
-    :members:
-
-GruCompute
-``````````
-..  doxygenclass:: paddle::GruCompute
-    :members:
-
-Recurrent Layer Group
-=====================
-
-AgentLayer
-----------
-..  doxygenclass:: paddle::AgentLayer
-    :members:
-
-SequenceAgentLayer
-------------------
-..  doxygenclass:: paddle::SequenceAgentLayer
-    :members:
-
-GatherAgentLayer
-----------------
-..  doxygenclass:: paddle::GatherAgentLayer
-    :members:
-
-SequenceGatherAgentLayer
-------------------------
-..  doxygenclass:: paddle::SequenceGatherAgentLayer
-    :members:
-
-ScatterAgentLayer
------------------
-..  doxygenclass:: paddle::ScatterAgentLayer
-    :members:
-
-SequenceScatterAgentLayer
--------------------------
-..  doxygenclass:: paddle::SequenceScatterAgentLayer
-    :members:
-
-GetOutputLayer
---------------
-..  doxygenclass:: paddle::GetOutputLayer
-    :members:
-
-Mixed Layer
-===========
-..  doxygenclass:: paddle::MixedLayer
-    :members:
-
-DotMulProjection
-----------------
-..  doxygenclass:: paddle::DotMulProjection
-    :members:
-
-DotMulOperator
---------------
-..  doxygenclass:: paddle::DotMulOperator
-    :members:
-
-FullMatrixProjection
---------------------
-..  doxygenclass:: paddle::FullMatrixProjection
-    :members:
-
-IdentityProjection
-------------------
-..  doxygenclass:: paddle::IdentityProjection
-    :members:
-
-IdentityOffsetProjection
-------------------------
-..  doxygenclass:: paddle::IdentityOffsetProjection
-    :members:
-
-TableProjection
----------------
-..  doxygenclass:: paddle::TableProjection
-    :members:
-
-TransposedFullMatrixProjection
-------------------------------
-..  doxygenclass:: paddle::TransposedFullMatrixProjection
-    :members:
-
-Aggregate Layers
-================
-
-Aggregate
----------
-AverageLayer
-````````````
-..  doxygenclass:: paddle::AverageLayer
-    :members:
-
-MaxLayer
-````````
-..  doxygenclass:: paddle::MaxLayer
-    :members:
-
-SequenceLastInstanceLayer
-`````````````````````````
-..  doxygenclass:: paddle::SequenceLastInstanceLayer
-    :members:
-
-Concat
-------
-ConcatenateLayer
-````````````````
-..  doxygenclass:: paddle::ConcatenateLayer
-    :members:
-
-ConcatenateLayer2
-`````````````````
-..  doxygenclass:: paddle::ConcatenateLayer2
-    :members:
-
-SequenceConcatLayer
-```````````````````
-..  doxygenclass:: paddle::SequenceConcatLayer
-    :members:
-
-Subset
-------
-SubSequenceLayer
-````````````````
-..  doxygenclass:: paddle::SubSequenceLayer
-    :members:
-
-Reshaping Layers
-================
-
-BlockExpandLayer
-----------------
-..  doxygenclass:: paddle::BlockExpandLayer
-    :members:
-
-ExpandLayer
------------
-..  doxygenclass:: paddle::ExpandLayer
-    :members:
-
-FeatureMapExpandLayer
----------------------
-..  doxygenclass:: paddle::FeatureMapExpandLayer
-    :members:
-
-ResizeLayer
------------
-..  doxygenclass:: paddle::ResizeLayer
-    :members:
-
-SequenceReshapeLayer
---------------------
-..  doxygenclass:: paddle::SequenceReshapeLayer
-    :members:
-
-Math Layers
-===========
-
-AddtoLayer
-----------
-..  doxygenclass:: paddle::AddtoLayer
-    :members:
-
-ConvexCombinationLayer
-----------------------
-..  doxygenclass:: paddle::ConvexCombinationLayer
-    :members:
-
-InterpolationLayer
-------------------
-..  doxygenclass:: paddle::InterpolationLayer
-    :members:
-
-MultiplexLayer
---------------
-..  doxygenclass:: paddle::MultiplexLayer
-    :members:
-
-OuterProdLayer
---------------
-..  doxygenclass:: paddle::OuterProdLayer
-    :members:
-
-PowerLayer
-----------
-..  doxygenclass:: paddle::PowerLayer
-    :members:
-
-ScalingLayer
-------------
-..  doxygenclass:: paddle::ScalingLayer
-    :members:
-
-SlopeInterceptLayer
--------------------
-..  doxygenclass:: paddle::SlopeInterceptLayer
-    :members:
-
-TensorLayer
-------------
-..  doxygenclass:: paddle::TensorLayer
-    :members:
-
-TransLayer
-----------
-..  doxygenclass:: paddle::TransLayer
-    :members:
-
-Sampling Layers
-===============
-
-BilinearInterpLayer
--------------------
-..  doxygenclass:: paddle::BilinearInterpLayer
-    :members:
-
-MultinomialSampler
-------------------
-..  doxygenclass:: paddle::MultinomialSampler
-    :members:
-
-MaxIdLayer
-----------
-..  doxygenclass:: paddle::MaxIdLayer
-    :members:
-
-SamplingIdLayer
----------------
-..  doxygenclass:: paddle::SamplingIdLayer
-    :members:
-
-Cost Layers
-===========
-
-CostLayer
------------
-..  doxygenclass:: paddle::CostLayer
-    :members:
-
-HuberTwoClass
-`````````````
-..  doxygenclass:: paddle::HuberTwoClass
-    :members:
-
-LambdaCost
-```````````
-..  doxygenclass:: paddle::LambdaCost
-    :members:
-
-MultiBinaryLabelCrossEntropy
-````````````````````````````
-..  doxygenclass:: paddle::MultiBinaryLabelCrossEntropy
-    :members:
-
-MultiClassCrossEntropy
-```````````````````````
-..  doxygenclass:: paddle::MultiClassCrossEntropy
-    :members:
-
-MultiClassCrossEntropyWithSelfNorm
-``````````````````````````````````
-..  doxygenclass:: paddle::MultiClassCrossEntropyWithSelfNorm
-    :members:
-
-RankingCost
-```````````
-..  doxygenclass:: paddle::RankingCost
-    :members:
-
-SoftBinaryClassCrossEntropy
-```````````````````````````
-..  doxygenclass:: paddle::SoftBinaryClassCrossEntropy
-    :members:
-
-SumOfSquaresCostLayer
-`````````````````````
-..  doxygenclass:: paddle::SumOfSquaresCostLayer
-    :members:
-
-SumCostLayer
-`````````````````````
-..  doxygenclass:: paddle::SumCostLayer
-    :members:
-
-CosSimLayer
------------
-..  doxygenclass:: paddle::CosSimLayer
-    :members:
-
-CosSimVecMatLayer
------------------
-..  doxygenclass:: paddle::CosSimVecMatLayer
-    :members:
-
-CRFDecodingLayer
-----------------
-..  doxygenclass:: paddle::CRFDecodingLayer
-    :members:
-
-CRFLayer
---------
-..  doxygenclass:: paddle::CRFLayer
-    :members:
-
-CTCLayer
---------
-..  doxygenclass:: paddle::CTCLayer
-    :members:
-
-HierarchicalSigmoidLayer
-------------------------
-..  doxygenclass:: paddle::HierarchicalSigmoidLayer
-    :members:
-
-LinearChainCRF
---------------
-..  doxygenclass:: paddle::LinearChainCRF
-    :members:
-
-LinearChainCTC
---------------
-..  doxygenclass:: paddle::LinearChainCTC
-    :members:
-
-NCELayer
---------
-..  doxygenclass:: paddle::NCELayer
-    :members:
-
-Validation Layers
------------------
-
-ValidationLayer
-```````````````
-..  doxygenclass:: paddle::ValidationLayer
-    :members:
-
-AucValidation
-`````````````
-..  doxygenclass:: paddle::AucValidation
-    :members:
-
-PnpairValidation
-````````````````
-..  doxygenclass:: paddle::PnpairValidation
-    :members:
-
-Check Layers
-============
-
-EosIdCheckLayer
----------------
-..  doxygenclass:: paddle::EosIdCheckLayer
-    :members:
diff --git a/doc/source/gserver/neworks.rst b/doc/source/gserver/neworks.rst
deleted file mode 100644
index 73fb60d549cc88f61d2e2d18c9ec31c37cf4fa9a..0000000000000000000000000000000000000000
--- a/doc/source/gserver/neworks.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Networks
-========
-
-NeuralNetwork
--------------
-..  doxygenclass:: paddle::NeuralNetwork
-    :members:
-
-ParallelNeuralNetwork
----------------------
-..  doxygenclass:: paddle::ParallelNeuralNetwork
-    :members:
diff --git a/doc/source/index.rst b/doc/source/index.rst
deleted file mode 100644
index 36323c888ee65147f59f28160dc26ca29235ba63..0000000000000000000000000000000000000000
--- a/doc/source/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Source Code Documents
-=====================
-
-.. toctree::
-  :maxdepth: 1
-
-  gserver/index.rst
-  trainer.rst
-  parameter/index.rst
-  pserver/index.rst
-  api.rst
-  cuda/index.rst
-  math/index.rst
-  utils/index.rst
diff --git a/doc/source/math/functions.rst b/doc/source/math/functions.rst
deleted file mode 100644
index aef12e0f005226c6d40d74d0e858a11585339758..0000000000000000000000000000000000000000
--- a/doc/source/math/functions.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Functions
-=========
-
-MathFunctions
--------------
-.. doxygenfile:: paddle/math/MathFunctions.h
-
-SIMDFunctions
--------------
-.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/index.rst b/doc/source/math/index.rst
deleted file mode 100644
index 2ec16f2b4450c870f9590aea4ad4ca7dc415b75d..0000000000000000000000000000000000000000
--- a/doc/source/math/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Math
-====
-
-.. toctree::
-  :maxdepth: 2
-
-  vector.rst
-  matrix.rst
-  functions.rst
-  utils.rst
diff --git a/doc/source/math/matrix.rst b/doc/source/math/matrix.rst
deleted file mode 100644
index 9bb20f618d229e1baea15e26378bf40d7c6e1783..0000000000000000000000000000000000000000
--- a/doc/source/math/matrix.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-Matrix
-======
-
-Base
-----
-
-BaseMatrix Template
-```````````````````
-..  doxygenclass:: paddle::BaseMatrixT
-    :members:
-
-Matrix
-``````
-..  doxygenclass:: paddle::Matrix
-    :members:
-
-MatrixOffset
-````````````
-..  doxygenclass:: paddle::MatrixOffset
-    :members:
-
-CpuMatrix
----------
-
-CpuMatrix
-`````````
-..  doxygenclass:: paddle::CpuMatrix
-    :members:
-
-SharedCpuMatrix
-```````````````
-..  doxygenclass:: paddle::SharedCpuMatrix
-    :members:
-
-GpuMatrix
----------
-..  doxygenclass:: paddle::GpuMatrix
-    :members:
-
-CpuSparseMatrix
----------------
-
-CpuSparseMatrix
-```````````````
-..  doxygenclass:: paddle::CpuSparseMatrix
-    :members:
-
-SparseRowCpuMatrix
-``````````````````
-..  doxygenclass:: paddle::SparseRowCpuMatrix
-    :members:
-
-SparseAutoGrowRowCpuMatrix
-``````````````````````````
-..  doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix
-    :members:
-
-SparsePrefetchRowCpuMatrix
-``````````````````````````
-..  doxygenclass:: paddle::SparsePrefetchRowCpuMatrix
-    :members:
-
-SparseRowIdsCpuMatrix
-`````````````````````
-..  doxygenclass:: paddle::SparseRowIdsCpuMatrix
-    :members:
-
-CacheRowCpuMatrix
-`````````````````
-..  doxygenclass:: paddle::CacheRowCpuMatrix
-    :members:
-
-GpuSparseMatrix
----------------
-..  doxygenclass:: paddle::GpuSparseMatrix
-    :members:
diff --git a/doc/source/math/utils.rst b/doc/source/math/utils.rst
deleted file mode 100644
index 55d9961a390c205563a9ae4fbd87ac4ae90fc314..0000000000000000000000000000000000000000
--- a/doc/source/math/utils.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Memory Manager
-==============
-
-Memory Handle
--------------
-.. doxygenfile:: paddle/math/MemoryHandle.h
-
-Allocator
----------
-.. doxygenfile:: paddle/math/Allocator.h
-
-PoolAllocator
-`````````````
-.. doxygenfile:: paddle/math/PoolAllocator.h
-
-Storage
--------
-.. doxygenfile:: paddle/math/Storage.h
diff --git a/doc/source/math/vector.rst b/doc/source/math/vector.rst
deleted file mode 100644
index 07f7062abaf4f30b8967b594f4e16ab881f5414f..0000000000000000000000000000000000000000
--- a/doc/source/math/vector.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-Vector
-======
-
-BaseVector
-``````````
-..  doxygenclass:: paddle::BaseVector
-    :members:
-
-Vector Template
-```````````````
-..  doxygenclass:: paddle::VectorT
-    :members:
-
-CpuVector Template
-``````````````````
-..  doxygenclass:: paddle::CpuVectorT
-    :members:
-
-GpuVector Template
-``````````````````
-..  doxygenclass:: paddle::GpuVectorT
-    :members:
-
-ParallelCpuVector Template
-``````````````````````````
-..  doxygenclass:: paddle::ParallelCpuVectorT
-    :members:
-
-ParallelGpuVector Template
-``````````````````````````
-..  doxygenclass:: paddle::ParallelGpuVectorT
-    :members:
-
-CpuGpuVector Template
-`````````````````````
-..  doxygenclass:: paddle::CpuGpuVectorT
-    :members:
diff --git a/doc/source/parameter/index.rst b/doc/source/parameter/index.rst
deleted file mode 100644
index 3bf6948dc3478574d8d125d8461235f8827e4e42..0000000000000000000000000000000000000000
--- a/doc/source/parameter/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Parameter
-=========
-
-.. toctree::
-  :maxdepth: 2
-
-  parameter.rst
-  optimizer.rst
-  updater.rst
diff --git a/doc/source/parameter/optimizer.rst b/doc/source/parameter/optimizer.rst
deleted file mode 100644
index b5b8b850b349d547c9e5508d3ebec3d7e00ea310..0000000000000000000000000000000000000000
--- a/doc/source/parameter/optimizer.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-Optimizer
-=========
-
-ParameterOptimizer
-------------------
-.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
-
-Regularizer
------------
-.. doxygenfile:: paddle/parameter/Regularizer.h
-
-FirstOrderOptimizer
--------------------
-.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
-
-AverageOptimizer
-----------------
-.. doxygenfile:: paddle/parameter/AverageOptimizer.h
-
-OptimizerWithRegularizer
-------------------------
-.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/parameter.rst b/doc/source/parameter/parameter.rst
deleted file mode 100644
index 2daa62d4e63b952cd93bba35ee32ce35ce768a0d..0000000000000000000000000000000000000000
--- a/doc/source/parameter/parameter.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Parameter
-=========
-
-Parameter
----------
-.. doxygenfile:: paddle/parameter/Argument.h
-.. doxygenfile:: paddle/parameter/Parameter.h
-.. doxygenfile:: paddle/parameter/ParallelParameter.h
-
-Weight
-------
-.. doxygenfile:: paddle/parameter/Weight.h
diff --git a/doc/source/parameter/updater.rst b/doc/source/parameter/updater.rst
deleted file mode 100644
index dfa22e8e7d1d6f0713974835de93194d2cc58e6f..0000000000000000000000000000000000000000
--- a/doc/source/parameter/updater.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Updater
-=======
-
-Base
-----
-.. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h
-
-Hook
-----
-.. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h
-
-Functions
----------
-.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
diff --git a/doc/source/pserver/client.rst b/doc/source/pserver/client.rst
deleted file mode 100644
index e5bba0706a1d919104b85e23861ba490a2c828db..0000000000000000000000000000000000000000
--- a/doc/source/pserver/client.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Client
-======
-
-BaseClient
-----------
-..  doxygenclass:: paddle::BaseClient
-    :members:
-
-ParameterClient2
-----------------
-..  doxygenclass:: paddle::ParameterClient2
-    :members:
diff --git a/doc/source/pserver/index.rst b/doc/source/pserver/index.rst
deleted file mode 100644
index 0031e9476bd063511cc2f0a8c209f35627cf44ba..0000000000000000000000000000000000000000
--- a/doc/source/pserver/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-PServer
-=======
-
-.. toctree::
-  :maxdepth: 2
-
-  client.rst
-  network.rst
-  server.rst
-  utils.rst
diff --git a/doc/source/pserver/network.rst b/doc/source/pserver/network.rst
deleted file mode 100644
index 7004c9d91fa9f2af11e15791ef682c108761027e..0000000000000000000000000000000000000000
--- a/doc/source/pserver/network.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Network
-=======
-
-SocketServer
-------------
-..  doxygenclass:: paddle::SocketServer
-    :members:
-
-SocketWorker
-------------
-..  doxygenclass:: paddle::SocketWorker
-    :members:
-
-SocketClient
-------------
-..  doxygenclass:: paddle::SocketClient
-    :members:
-
-SocketChannel
--------------
-..  doxygenclass:: paddle::SocketChannel
-    :members:
-
-MessageReader
--------------
-..  doxygenclass:: paddle::MsgReader
-    :members:
diff --git a/doc/source/pserver/server.rst b/doc/source/pserver/server.rst
deleted file mode 100644
index 35301acf8ffe3d97e6124c37cf8fe1b43071e14e..0000000000000000000000000000000000000000
--- a/doc/source/pserver/server.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Server
-======
-
-ProtoServer
------------
-..  doxygenclass:: paddle::ProtoServer
-    :members:
-
-ParameterServer2
-----------------
-..  doxygenclass:: paddle::ParameterServer2
-    :members:
diff --git a/doc/source/trainer.rst b/doc/source/trainer.rst
deleted file mode 100644
index 85f1feb4fc941f94e65a6b1d037445d2367f65ec..0000000000000000000000000000000000000000
--- a/doc/source/trainer.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Trainer
-=======
-
-TrainerStats
-------------
-
-..  doxygenclass:: paddle::TrainerStats
-    :members:
-
-RemoteParameterUpdater
------------------------
-
-..  doxygenclass:: paddle::RemoteParameterUpdater
-    :members:
-
-ConcurrentRemoteParameterUpdater
---------------------------------
-
-..  doxygenclass:: paddle::ConcurrentRemoteParameterUpdater
-    :members:
-
-SparseRemoteParameterUpdater
-----------------------------
-
-..  doxygenclass:: paddle::SparseRemoteParameterUpdater
-    :members:
-
-SparseRemoteParameterUpdaterComposite
--------------------------------------
-
-..  doxygenclass:: paddle::SparseRemoteParameterUpdaterComposite
-    :members:
diff --git a/doc/source/utils/customStackTrace.rst b/doc/source/utils/customStackTrace.rst
deleted file mode 100644
index cdc8930739eb4b4d6308ff1fbce170d2977d42e8..0000000000000000000000000000000000000000
--- a/doc/source/utils/customStackTrace.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-CustomStackTrace
-================
-..  doxygenclass:: paddle::CustomStackTrace
-    :members:
diff --git a/doc/source/utils/enum.rst b/doc/source/utils/enum.rst
deleted file mode 100644
index e0da75afe164f9dab59b862faa7230fc57423e50..0000000000000000000000000000000000000000
--- a/doc/source/utils/enum.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-Enumeration wrapper
-===================
-..  doxygennamespace:: paddle::enumeration_wrapper
diff --git a/doc/source/utils/index.rst b/doc/source/utils/index.rst
deleted file mode 100644
index 7ddc47d1726f7627852be922d2b769d0752aa799..0000000000000000000000000000000000000000
--- a/doc/source/utils/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Utils
-=====
-
-.. toctree::
-  :maxdepth: 2
-
-  lock.rst
-  queue.rst
-  thread.rst
-  customStackTrace.rst
-  enum.rst
diff --git a/doc/source/utils/lock.rst b/doc/source/utils/lock.rst
deleted file mode 100644
index f011acb9431f0f3dc3b2ba27fcfe71fe6eb07ae9..0000000000000000000000000000000000000000
--- a/doc/source/utils/lock.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Lock
-====
-
-RWLock
-------
-..  doxygenclass:: paddle::RWLock
-    :members:
-
-ReadLockGuard
--------------
-..  doxygenclass:: paddle::ReadLockGuard
-    :members:
-
-SpinLock
---------
-..  doxygenclass:: paddle::SpinLock
-    :members:
-
-Semaphore
----------
-..  doxygenclass:: paddle::Semaphore
-    :members:
-
-ThreadBarrier
--------------
-..  doxygenclass:: paddle::ThreadBarrier
-    :members:
-
-LockedCondition
----------------
-..  doxygenclass:: paddle::LockedCondition
-    :members:
diff --git a/doc/source/utils/queue.rst b/doc/source/utils/queue.rst
deleted file mode 100644
index 98192648e2d61e622c2337d10ba024dd676ee685..0000000000000000000000000000000000000000
--- a/doc/source/utils/queue.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Queue
-=====
-
-Queue
------
-..  doxygenclass:: paddle::Queue
-    :members:
-
-BlockingQueue 
--------------
-..  doxygenclass:: paddle::BlockingQueue 
-    :members:
diff --git a/doc/source/utils/thread.rst b/doc/source/utils/thread.rst
deleted file mode 100644
index 23d379a9894e5fc22bc6795a480a53d768e608e6..0000000000000000000000000000000000000000
--- a/doc/source/utils/thread.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Thread
-======
-
-Thread 
-------
-..  doxygenclass:: paddle::Thread
-    :members:
-
-ThreadWorker
-------------
-..  doxygenclass:: paddle::ThreadWorker
-    :members:
-
-SyncThreadPool 
---------------
-..  doxygenclass:: paddle::SyncThreadPool 
-    :members:
-    
-MultiThreadWorker 
------------------
-..  doxygenclass:: paddle::MultiThreadWorker 
-    :members:
-
-AsyncThreadPool 
----------------
-..  doxygenclass:: paddle::AsyncThreadPool
-    :members:
diff --git a/doc_cn/conf.py.in b/doc/templates/conf.py.cn.in
similarity index 94%
rename from doc_cn/conf.py.in
rename to doc/templates/conf.py.cn.in
index 93242ace406000c84414bcabe1ecb683b9ff3cea..418d718fbd9c61bff3acb9c2dab0638c0b650bab 100644
--- a/doc_cn/conf.py.in
+++ b/doc/templates/conf.py.cn.in
@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, '@PROJ_ROOT@/python')
-templates_path = ["@PROJ_ROOT@/doc/templates"]
+templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -62,14 +62,14 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_cn'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'zh_CN'
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
@@ -79,7 +79,7 @@ language = None
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_en*', '*_en*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -112,12 +112,12 @@ todo_include_todos = False
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-#html_theme = 'sphinx_rtd_theme'  # sphinx_rtd_theme will cause table bad style
-html_theme = 'classic'
+html_theme = 'sphinx_rtd_theme'
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ['@PROJ_ROOT@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
diff --git a/doc/conf.py.in b/doc/templates/conf.py.en.in
similarity index 90%
rename from doc/conf.py.in
rename to doc/templates/conf.py.en.in
index 6c221f598b805fc00a9475a269d95cb54d1f4e98..e96c25cb75bee20d2e2949423d80ddab1d3450a1 100644
--- a/doc/conf.py.in
+++ b/doc/templates/conf.py.en.in
@@ -23,21 +23,7 @@ AutoStructify = transform.AutoStructify
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, '@PROJ_ROOT@/python')
 
-templates_path = ["@PROJ_ROOT@/doc/templates"]
-
-# -- Doxygen Settings
-breathe_projects = {
-   'paddle': '@PADDLE_DOXYGEN_OUTPUT@/xml'
-}
-breathe_default_project = 'paddle'
-
-breathe_domain_by_extension = {
-    'h': 'cpp',  # mapping XXX.h XXX.cpp together
-}
-
-breathe_default_members = {
-    'protected-members','undoc-members'
-}
+templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -62,7 +48,6 @@ extensions = [
     'sphinx.ext.autosummary',
     'sphinx.ext.mathjax',
     'sphinx.ext.napoleon',
-    'breathe'
 ]
 
 
@@ -78,7 +63,7 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_en'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -95,7 +80,7 @@ language = None
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -128,13 +113,12 @@ todo_include_todos = False
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-#html_theme = 'sphinx_rtd_theme'
-html_theme = 'classic'
+html_theme = 'sphinx_rtd_theme'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ['@PROJ_ROOT@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
@@ -160,5 +144,6 @@ def setup(app):
     # no c++ API for now
     app.add_config_value('recommonmark_config', {
             'url_resolver': lambda url: github_doc_root + url,
+        'enable_eval_rst': True,
             }, True)
     app.add_transform(AutoStructify)
diff --git a/doc/demo/embedding_model/index.md b/doc/tutorials/embedding_model/index_en.md
similarity index 99%
rename from doc/demo/embedding_model/index.md
rename to doc/tutorials/embedding_model/index_en.md
index 06f3ff1f009e470cdb9687658613a76acbb79751..d793a50f488e464bcd90a2fb506a8dcc3c760433 100644
--- a/doc/demo/embedding_model/index.md
+++ b/doc/tutorials/embedding_model/index_en.md
@@ -93,7 +93,7 @@ where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the
 - `--init_model_path`: path of the initialization model, here is `data/paraphrase_model`
 - `--load_missing_parameter_strategy`: operations when model file is missing, here use a normal distibution to initialize the other parameters except for the embedding layer
 
-For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/text_generation.md).
+For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/index_en.md).
 
 ## Optional Function ##
 ###  Embedding Parameters Observation
diff --git a/doc/demo/embedding_model/neural-n-gram-model.png b/doc/tutorials/embedding_model/neural-n-gram-model.png
similarity index 100%
rename from doc/demo/embedding_model/neural-n-gram-model.png
rename to doc/tutorials/embedding_model/neural-n-gram-model.png
diff --git a/doc/demo/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png
similarity index 100%
rename from doc/demo/image_classification/cifar.png
rename to doc/tutorials/image_classification/cifar.png
diff --git a/doc/demo/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png
similarity index 100%
rename from doc/demo/image_classification/image_classification.png
rename to doc/tutorials/image_classification/image_classification.png
diff --git a/doc/demo/image_classification/image_classification.md b/doc/tutorials/image_classification/index_en.md
similarity index 100%
rename from doc/demo/image_classification/image_classification.md
rename to doc/tutorials/image_classification/index_en.md
diff --git a/doc/demo/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png
similarity index 100%
rename from doc/demo/image_classification/lenet.png
rename to doc/tutorials/image_classification/lenet.png
diff --git a/doc/demo/image_classification/plot.png b/doc/tutorials/image_classification/plot.png
similarity index 100%
rename from doc/demo/image_classification/plot.png
rename to doc/tutorials/image_classification/plot.png
diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png
new file mode 100644
index 0000000000000000000000000000000000000000..f54a0c58837cb3385b32dc57d02cec92666ef0f1
Binary files /dev/null and b/doc/tutorials/image_classification/src/cifar.png differ
diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png
new file mode 100644
index 0000000000000000000000000000000000000000..14f255805081c1b4fab27eaf336fd389fa93ca19
Binary files /dev/null and b/doc/tutorials/image_classification/src/image_classification.png differ
diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e6f2b32bad797f3fccb929c72a121fc935b0cbb
Binary files /dev/null and b/doc/tutorials/image_classification/src/lenet.png differ
diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..a31f99791c670e18bb8c62b7604ec8cb0284ffb4
Binary files /dev/null and b/doc/tutorials/image_classification/src/plot.png differ
diff --git a/doc/demo/imagenet_model/resnet_block.jpg b/doc/tutorials/imagenet_model/resnet_block.jpg
similarity index 100%
rename from doc/demo/imagenet_model/resnet_block.jpg
rename to doc/tutorials/imagenet_model/resnet_block.jpg
diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/tutorials/imagenet_model/resnet_model_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..82ec9d70b345c11aba3aa86f8206eedc8072bb88
--- /dev/null
+++ b/doc/tutorials/imagenet_model/resnet_model_cn.md
@@ -0,0 +1,284 @@
+# Model Zoo - ImageNet #
+
+[ImageNet](http://www.image-net.org/) 是通用物体分类领域一个众所周知的数据库。本教程提供了一个用于ImageNet上的卷积分类网络模型。
+
+## ResNet 介绍
+
+论文 [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) 中提出的ResNet网络结构在2015年ImageNet大规模视觉识别竞赛(ILSVRC 2015)的分类任务中赢得了第一名。他们提出残差学习的框架来简化网络的训练，所构建网络结构的的深度比之前使用的网络有大幅度的提高。下图展示的是基于残差的连接方式。左图构造网络模块的方式被用于34层的网络中，而右图的瓶颈连接模块用于50层，101层和152层的网络结构中。
+
+<center>![resnet_block](./resnet_block.jpg)</center>
+<center>图 1. ResNet 网络模块</center>
+
+本教程中我们给出了三个ResNet模型，这些模型都是由原作者提供的模型<https://github.com/KaimingHe/deep-residual-networks>转换过来的。我们使用PaddlePaddle在ILSVRC的验证集共50,000幅图像上测试了模型的分类错误率，其中输入图像的颜色通道顺序为**BGR**，保持宽高比缩放到短边为256，只截取中心方形的图像区域。分类错误率和模型大小由下表给出。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">ResNet</th>
+<th scope="col" class="left">Top-1</th>
+<th scope="col" class="left">Model Size</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">ResNet-50</td>
+<td class="left">24.9%</td>
+<td class="left">99M</td>
+</tr>
+<tr>
+<td class="left">ResNet-101</td>
+<td class="left">23.7%</td>
+<td class="left">173M</td>
+</tr>
+<tr>
+<td class="left">ResNet-152</td>
+<td class="left">23.2%</td>
+<td class="left">234M</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+## ResNet 模型
+
+50层，101层和152层的网络配置文件可参照```demo/model_zoo/resnet/resnet.py```。你也可以通过在命令行参数中增加一个参数如```--config_args=layer_num=50```来指定网络层的数目。
+
+### 网络可视化
+
+你可以通过执行下面的命令来得到ResNet网络的结构可视化图。该脚本会生成一个dot文件，然后可以转换为图片。需要安装graphviz来转换dot文件为图片。
+
+```
+cd demo/model_zoo/resnet
+./net_diagram.sh
+```
+
+### 模型下载
+
+```
+cd demo/model_zoo/resnet
+./get_model.sh
+```
+你可以执行上述命令来下载所有的模型和均值文件，如果下载成功，这些文件将会被保存在```demo/model_zoo/resnet/model```路径下。
+
+```
+mean_meta_224  resnet_101  resnet_152  resnet_50
+```
+   * resnet_50: 50层网络模型。
+   * resnet_101: 101层网络模型。
+   * resnet_152: 152层网络模型。
+   * mean\_meta\_224: 均值图像文件，图像大小为3 x 224 x 224，颜色通道顺序为**BGR**。你也可以使用这三个值: 103.939, 116.779, 123.68。
+
+### 参数信息
+
+* **卷积层权重**
+
+  由于每个卷积层后面连接的是batch normalization层，因此该层中没有偏置(bias)参数，并且只有一个权重。
+  形状: `(Co, ky, kx, Ci)`
+   * Co: 输出特征图的通道数目
+   * ky: 滤波器核在垂直方向上的尺寸
+   * kx: 滤波器核在水平方向上的尺寸
+   * Ci: 输入特征图的通道数目
+
+  二维矩阵: (Co * ky * kx, Ci), 行优先次序存储。
+
+* **全连接层权重**
+
+  二维矩阵: (输入层尺寸, 本层尺寸), 行优先次序存储。
+
+* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) 层权重**
+
+本层有四个参数，实际上只有.w0和.wbias是需要学习的参数，另外两个分别是滑动均值和方差。在测试阶段它们将会被加载到模型中。下表展示了batch normalization层的参数。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">参数名</th>
+<th scope="col" class="left">尺寸</th>
+<th scope="col" class="left">含义</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">_res2_1_branch1_bn.w0</td>
+<td class="left">256</td>
+<td class="left">gamma, 缩放参数</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w1</td>
+<td class="left">256</td>
+<td class="left">特征图均值</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w2</td>
+<td class="left">256</td>
+<td class="left">特征图方差</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.wbias</td>
+<td class="left">256</td>
+<td class="left">beta, 偏置参数</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+### 参数读取
+
+使用者可以使用下面的Python脚本来读取参数值:
+
+```
+import sys
+import numpy as np
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+if __name__=='__main__':
+    weight = load(sys.argv[1])
+```
+
+或者直接使用下面的shell命令:
+
+```
+od -j 16 -f _res2_1_branch1_bn.w0
+```
+
+## 特征提取
+
+我们提供了C++和Python接口来提取特征。下面的例子使用了`demo/model_zoo/resnet/example`中的数据，详细地展示了整个特征提取的过程。
+
+### C++接口
+
+首先，在配置文件中的`define_py_data_sources2`里指定图像数据列表，具体请参照示例`demo/model_zoo/resnet/resnet.py`。
+
+```
+    train_list = 'train.list' if not is_test else None
+    # mean.meta is mean file of ImageNet dataset.
+    # mean.meta size : 3 x 224 x 224.
+    # If you use three mean value, set like:
+    # "mean_value:103.939,116.779,123.68;"
+    args={
+        'mean_meta': "model/mean_meta_224/mean.meta",
+        'image_size': 224, 'crop_size': 224,
+        'color': True,'swap_channel:': [2, 1, 0]}
+    define_py_data_sources2(train_list,
+                           'example/test.list',
+                           module="example.image_list_provider",
+                           obj="processData",
+                           args=args)
+```
+
+第二步，在`resnet.py`文件中指定要提取特征的网络层的名字。例如，
+
+```
+Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
+```
+
+第三步，在`extract_fea_c++.sh`文件中指定模型路径和输出的目录，然后执行下面的命令。
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_c++.sh
+```
+
+如果执行成功，特征将会存到`fea_output/rank-00000`文件中，如下所示。同时你可以使用`load_feature.py`文件中的`load_feature_c`接口来加载该文件。
+
+```
+-0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
+-0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
+```
+
+* 每行存储的是一个样本的特征。其中，第一行存的是图像`example/dog.jpg`的特征，第二行存的是图像`example/cat.jpg`的特征。
+* 不同层的特征由分号`;`隔开，并且它们的顺序与`Outputs()`中指定的层顺序一致。这里，左边是`res5_3_branch2c_conv`层的特征，右边是`res5_3_branch2c_bn`层特征。
+
+### Python接口
+
+示例`demo/model_zoo/resnet/classify.py`中展示了如何使用Python来提取特征。下面的例子同样使用了`./example/test.list`中的数据。执行的命令如下：
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_py.sh
+```
+
+extract_fea_py.sh:
+
+```
+python classify.py \
+     --job=extract \
+     --conf=resnet.py\
+     --use_gpu=1 \
+     --mean=model/mean_meta_224/mean.meta \
+     --model=model/resnet_50 \
+     --data=./example/test.list \
+     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
+     --output_dir=features
+
+```
+* \--job=extract:              指定工作模式来提取特征。
+* \--conf=resnet.py:           网络配置文件。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+* \--output_layer="xxx,xxx":   指定提取特征的层。
+* \--output_dir=features:      输出目录。
+
+如果运行成功，你将会看到特征存储在`features/batch_0`文件中，该文件是由cPickle产生的。你可以使用`load_feature.py`中的`load_feature_py`接口来打开该文件，它将返回如下的字典：
+
+```
+{
+'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
+'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
+}
+```
+
+仔细观察，这些特征值与上述使用C++接口提取的结果是一致的。
+
+## 预测
+
+`classify.py`文件也可以用于对样本进行预测。我们提供了一个示例脚本`predict.sh`，它使用50层的ResNet模型来对`example/test.list`中的数据进行预测。
+
+```
+cd demo/model_zoo/resnet
+./predict.sh
+```
+
+predict.sh调用了`classify.py`:
+
+```
+python classify.py \
+     --job=predict \
+     --conf=resnet.py\
+     --multi_crop \
+     --model=model/resnet_50 \
+     --use_gpu=1 \
+     --data=./example/test.list
+```
+* \--job=extract:              指定工作模型进行预测。
+* \--conf=resnet.py:           网络配置文件。network configure.
+* \--multi_crop:               使用10个裁剪图像块，预测概率取平均。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+
+如果运行成功，你将会看到如下结果，其中156和285是这些图像的分类标签。
+
+```
+Label of example/dog.jpg is: 156
+Label of example/cat.jpg is: 282
+```
diff --git a/doc/demo/imagenet_model/resnet_model.md b/doc/tutorials/imagenet_model/resnet_model_en.md
similarity index 95%
rename from doc/demo/imagenet_model/resnet_model.md
rename to doc/tutorials/imagenet_model/resnet_model_en.md
index 5403ab9f17d2399fee878d0f3c512cb166aba06f..478ad06193b14ba7fe02238df621db1f7b0804d4 100644
--- a/doc/demo/imagenet_model/resnet_model.md
+++ b/doc/tutorials/imagenet_model/resnet_model_en.md
@@ -52,7 +52,7 @@ See ```demo/model_zoo/resnet/resnet.py```. This config contains network of 50, 1
 
 ### Network Visualization
 
-You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which uses installed draw_dot tool in our server. If you can not access the server, just install graphviz to convert dot file.
+You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which needs to install graphviz to convert.
 
 ```
 cd demo/model_zoo/resnet
@@ -138,7 +138,7 @@ There are four parameters in this layer. In fact, only .w0 and .wbias are the le
 
 ### Parameter Observation
 
-Users who want to observe the parameters can use python to read:
+Users who want to observe the parameters can use Python to read:
 
 ```
 import sys
@@ -209,7 +209,7 @@ If successful, features are saved in `fea_output/rank-00000` as follows. And you
 
 ### Python Interface
 
-`demo/model_zoo/resnet/classify.py` is an example to show how to use python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
+`demo/model_zoo/resnet/classify.py` is an example to show how to use Python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
 
 ```
 cd demo/model_zoo/resnet
@@ -238,8 +238,6 @@ python classify.py \
 * \--output_layer="xxx,xxx":   specify layers to extract features.
 * \--output_dir=features:      output diretcoty.
 
-Note, since the convolution layer in these ResNet models is suitable for the cudnn implementation which only support GPU. It not support CPU mode because of compatibility issue and we will fix later.
-
 If run successfully, you will see features saved in `features/batch_0`, this file is produced with cPickle. You can use `load_feature_py` interface in `load_feature.py` to open the file, and it returns a dictionary as follows:
 
 ```
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..adc75978a7820b7f9c9239ea8a727aa3d587cab0
--- /dev/null
+++ b/doc/tutorials/index_cn.md
@@ -0,0 +1,24 @@
+# 完整教程
+
+## 快速入门
+
+使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
+
+* [阅读教程](quick_start/index_cn.rst)
+
+## 图像
+
+* TBD
+
+## 自然语言处理
+
+* [情感分类](sentiment_analysis/index_cn.md)
+* [语义角色标注](semantic_role_labeling/index_cn.md)
+
+## 个性化推荐
+
+* TBD
+
+## 常用模型
+
+* TBD
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..63b2091c245eedf61a31da620e5804daf765cc42
--- /dev/null
+++ b/doc/tutorials/index_en.md
@@ -0,0 +1,24 @@
+# TUTORIALS
+There are several examples and demos here.
+
+## Quick Start
+
+* [Quick Start](quick_start/index_en.md)
+
+## Image
+
+* [Image Classification](image_classification/index_en.md)
+
+## NLP
+
+* [Sentiment Analysis](sentiment_analysis/index_en.md)
+* [Text Generation](text_generation/index_en.md)
+* [Semantic Role Labeling](semantic_role_labeling/index_en.md)
+
+## Recommendation
+
+* [MovieLens Regression](rec/ml_regression_en.rst)
+
+## Model Zoo
+* [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
+* [Embedding: Chinese Word](embedding_model/index_en.md)
diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/tutorials/quick_start/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..936f16118a439b310794157191bb6d82d8fa6d42
--- /dev/null
+++ b/doc/tutorials/quick_start/index_cn.rst
@@ -0,0 +1,396 @@
+=============
+快速入门教程
+=============
+
+我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
+介绍PaddlePaddle的基本使用方法。
+
+安装
+====
+
+请参考 `安装教程 <../../build_and_install/index.html>`_ 安装PaddlePaddle。
+
+使用概述
+========
+
+**文本分类问题**：对于给定的一条文本，我们从提前给定的类别集合中选择其所属类别。
+
+比如, 在购物网站上，通过查看买家对某个产品的评价反馈, 评估该产品的质量。
+
+- 这个显示器很棒！ （好评）
+- 用了两个月之后这个显示器屏幕碎了。（差评）
+
+使用PaddlePaddle, 每一个任务流程都可以被划分为如下五个步骤。
+
+    ..  image:: src/Pipeline_cn.jpg
+        :align: center
+        :scale: 80%
+
+1. 数据格式准备
+    - 本例每行保存一条样本，类别Id和文本信息用 ``Tab`` 间隔，文本中的单词用空格分隔（如果不切词，则字与字之间用空格分隔），例如：``类别Id '\t' 这 个 显 示 器 很 棒 ！``
+2. 向系统传送数据
+    - PaddlePaddle可以执行用户的python脚本程序来读取各种格式的数据文件。
+    - 本例的所有字符都将转换为连续整数表示的Id传给模型。
+3. 描述网络结构和优化算法
+    - 本例由易到难展示4种不同的文本分类网络配置：逻辑回归模型，词向量模型，卷积模型，时序模型。
+    - 常用优化算法包括Momentum, RMSProp，AdaDelta，AdaGrad，Adam，Adamax等，本例采用Adam优化方法，加了L2正则和梯度截断。
+4. 训练模型
+5. 应用模型
+
+数据格式准备
+------------
+
+接下来我们将展示如何用PaddlePaddle训练一个文本分类模型，将 `Amazon电子产品评论数据 <http://jmcauley.ucsd.edu/data/amazon/>`_ 分为好评(正样本)和差评(负样本)两种类别。
+`源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录里提供了该数据的下载脚本和预处理脚本，你只需要在命令行输入以下命令，就能够很方便的完成数据下载和相应的预处理工作。
+
+.. code-block:: bash
+
+    cd demo/quick_start
+    ./data/get_data.sh
+    ./preprocess.sh
+
+数据预处理完成之后，通过配置类似于 ``dataprovider_*.py`` 的数据读取脚本和类似于 ``trainer_config.*.py`` 的训练模型脚本，PaddlePaddle将以设置参数的方式来设置
+相应的数据读取脚本和训练模型脚本。接下来，我们将对这两个步骤给出了详细的解释，你也可以先跳过本文的解释环节，直接进入训练模型章节, 使用 ``sh train.sh`` 开始训练模型，
+查看`train.sh`内容，通过 **自底向上法** (bottom-up approach)来帮助你理解PaddlePaddle的内部运行机制。
+
+
+向系统传送数据
+==============
+
+Python脚本读取数据
+------------------
+
+`DataProvider <../../ui/data_provider/index.html>`_ 是PaddlePaddle负责提供数据的模块。``DataProvider`` 主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：
+
+* initializer：PaddlePaddle会在调用读取数据的Python脚本之前，先调用initializer函数。在下面例子里，我们在initialzier函数里初始化词表，并且在随后的读取数据过程中填充词表。
+* process：PaddlePaddle调用process函数来读取数据。每次读取一条数据后，process函数会用yield语句输出这条数据，从而能够被PaddlePaddle 捕获 (harvest)。
+
+``dataprovider_bow.py`` 文件给出了完整例子：
+
+..  literalinclude:: ../../../demo/quick_start/dataprovider_bow.py
+     :language: python
+     :lines: 21-70
+     :linenos:
+     :emphasize-lines: 8,33
+
+
+配置中的数据加载定义
+--------------------
+
+在模型配置中通过 ``define_py_data_sources2`` 接口来加载数据：
+
+..  literalinclude:: ../../../demo/quick_start/trainer_config.emb.py
+     :language: python
+     :lines: 19-35
+     :linenos:
+     :emphasize-lines: 12
+
+
+以下是对上述数据加载的解释：
+
+- data/train.list,data/test.list: 指定训练数据和测试数据
+- module="dataprovider_bow": 处理数据的Python脚本文件
+- obj="process": 指定生成数据的函数
+- args={"dictionary": word_dict}: 额外的参数，这里指定词典
+
+更详细数据格式和用例请参考 `PyDataProvider2 <../../ui/data_provider/pydataprovider2.html>`_ 。
+
+模型网络结构
+============
+
+本小节我们将介绍模型网络结构。
+
+    ..  image:: src/PipelineNetwork_cn.jpg
+        :align: center
+        :scale: 80%
+
+
+我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 `Layer文档 <../../../doc/layer.html>`_ 。
+所有配置都能在 `源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录下找到。
+
+逻辑回归模型
+------------
+
+具体流程如下:
+
+    ..  image:: src/NetLR_cn.jpg
+        :align: center
+        :scale: 80%
+
+- 获取利用 `one-hot vector <https://en.wikipedia.org/wiki/One-hot>`_ 表示的每个单词，维度是词典大小
+
+    .. code-block:: python
+
+        word = data_layer(name="word",  size=word_dim)
+
+- 获取该条样本类别Id，维度是类别个数。
+
+    .. code-block:: python
+
+        label = data_layer(name="label", size=label_dim)
+
+- 利用逻辑回归模型对该向量进行分类，同时会计算分类准确率
+
+    .. code-block:: python
+
+        # Define a fully connected layer with logistic activation (also called softmax activation).
+        output = fc_layer(input=word,
+                        size=label_dim,
+                        act_type=SoftmaxActivation())
+        # Define cross-entropy classification loss and error.
+        classification_cost(input=output, label=label)
+
+
+ - input: 除去data层，每个层都有一个或多个input,多个input以list方式输入
+ - size: 该层神经元个数
+ - act_type: 激活函数类型
+
+**效果总结**：我们将在后面介绍训练和预测流程的脚本。在此为方便对比不同网络结构，我们总结了各个网络的复杂度和效果。
+
+    =====================  ===============================  =================
+    网络名称                        参数数量                    错误率
+    =====================  ===============================  =================
+    逻辑回归                      252 KB                       8.652 %
+    =====================  ===============================  =================
+
+词向量模型
+----------
+
+embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovider_emb.py``，词向量模型、
+卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。
+
+.. code-block:: python
+
+    def initializer(settings, dictionary, **kwargs):
+        settings.word_dict = dictionary
+        settings.input_types = [
+            # Define the type of the first input as sequence of integer.
+            # The value of the integers range from 0 to len(dictrionary)-1
+            integer_value_sequence(len(dictionary)),
+            # Define the second input for label id
+            integer_value(2)]
+
+    @provider(init_hook=initializer)
+    def process(settings, file_name):
+        ...
+        # omitted, it is same as the data provider for LR model
+
+该模型依然使用逻辑回归分类网络的框架， 只是将句子用连续向量表示替换为用稀疏向量表示， 即对第三步进行替换。句子表示的计算更新为两步：
+
+..  image:: src/NetContinuous_cn.jpg
+    :align: center
+    :scale: 80%
+
+- 利用单词Id查找该单词对应的连续向量(维度为word_dim)， 输入N个单词，输出为N个word_dim维度向量
+
+    .. code-block:: python
+
+        emb = embedding_layer(input=word, size=word_dim)
+
+- 将该句话包含的所有单词向量求平均, 得到句子的表示
+
+    .. code-block:: python
+
+        avg = pooling_layer(input=emb, pooling_type=AvgPooling())
+
+其它部分和逻辑回归网络结构一致。
+
+**效果总结：**
+
+    =====================  ===============================  ==================
+    网络名称                        参数数量                    错误率
+    =====================  ===============================  ==================
+    词向量模型                      15 MB                       8.484 %
+    =====================  ===============================  ==================
+
+卷积模型
+-----------
+
+卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型进一步演化为三个新步骤。
+
+..  image:: src/NetConv_cn.jpg
+    :align: center
+    :scale: 80%
+
+文本卷积分可为三个步骤:
+
+1. 首先，从每个单词左右两端分别获取k个相邻的单词, 拼接成一个新的向量；
+
+2. 其次，对该向量进行非线性变换(例如Sigmoid变换), 使其转变为维度为hidden_dim的新向量；
+
+3. 最后，对整个新向量集合的每一个维度取最大值来表示最后的句子。
+
+这三个步骤可配置为:
+
+.. code-block:: python
+
+    text_conv = sequence_conv_pool(input=emb,
+                                context_start=k,
+                                context_len=2 * k + 1)
+
+**效果总结：**
+
+    =====================  ===============================  ========================
+    网络名称                        参数数量                    错误率
+    =====================  ===============================  ========================
+    卷积模型                      16 MB                       5.628 %
+    =====================  ===============================  ========================
+
+时序模型
+----------
+
+..  image:: src/NetRNN_cn.jpg
+    :align: center
+    :scale: 80%
+
+时序模型，也称为RNN模型, 包括简单的 `RNN模型 <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_, `GRU模型 <https://en.wikipedia.org/wiki/Gated_recurrent_unit>`_ 和 `LSTM模型 <https://en.wikipedia.org/wiki/Long_short-term_memory>`_ 等等。
+
+- GRU模型配置：
+
+    .. code-block:: python
+
+        gru = simple_gru(input=emb, size=gru_size)
+
+
+- LSTM模型配置：
+
+    .. code-block:: python
+
+        lstm = simple_lstm(input=emb, size=lstm_size)
+
+本次试验，我们采用单层LSTM模型，并使用了Dropout，**效果总结：**
+
+    =====================  ===============================  =========================
+    网络名称                        参数数量                    错误率
+    =====================  ===============================  =========================
+    时序模型                      16 MB                       4.812 %
+    =====================  ===============================  =========================
+
+优化算法
+=========
+
+`优化算法 <http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/optimizers_index.html>`_ 包括
+Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优化方法，同时使用了L2正则(L2 Regularization)和梯度截断(Gradient Clipping)。
+
+.. code-block:: python
+
+    settings(batch_size=128,
+            learning_rate=2e-3,
+            learning_method=AdamOptimizer(),
+            regularization=L2Regularization(8e-4),
+            gradient_clipping_threshold=25)
+
+训练模型
+=========
+
+在数据加载和网络配置完成之后， 我们就可以训练模型了。
+
+..  image:: src/PipelineTrain_cn.jpg
+    :align: center
+    :scale: 80%
+
+训练模型，我们只需要运行 ``train.sh`` 训练脚本：
+
+    .. code-block:: bash
+
+        ./train.sh
+
+``train.sh`` 中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
+
+    .. code-block:: bash
+
+        paddle train \
+        --config=trainer_config.py \
+        --log_period=20 \
+        --save_dir=./output \
+        --num_passes=15 \
+        --use_gpu=false
+
+这里只简单介绍了单机训练，如何进行分布式训练，可以参考教程 `分布式训练 <../../cluster/index.html>`_ 。
+
+预测
+=====
+
+当模型训练好了之后，我们就可以进行预测了。
+
+..  image:: src/PipelineTest_cn.jpg
+    :align: center
+    :scale: 80%
+
+之前配置文件中 ``test.list`` 指定的数据将会被测试，这里直接通过预测脚本 ``predict.sh`` 进行预测,
+更详细的说明，可以参考 `Python API预测 <../../ui/predict/swig_py_paddle.html>`_ 教程。
+
+    .. code-block:: bash
+
+        model="output/pass-00003"
+        paddle train \
+            --config=trainer_config.lstm.py \
+            --use_gpu=false \
+            --job=test \
+            --init_model_path=$model \
+            --config_args=is_predict=1 \
+            --predict_output_dir=. \
+
+        mv rank-00000 result.txt
+
+这里以 ``output/pass-00003`` 为例进行预测，用户可以根据训练日志，选择测试结果最好的模型来预测。
+
+预测结果以文本的形式保存在 ``result.txt`` 中，一行为一个样本，格式如下：
+
+    .. code-block:: bash
+
+        预测ID;ID为0的概率 ID为1的概率
+        预测ID;ID为0的概率 ID为1的概率
+
+总体效果总结
+==============
+
+在 ``/demo/quick_start`` 目录下，能够找到这里使用的所有数据, 网络配置, 训练脚本等等。
+对于Amazon-Elec测试集(25k), 如下表格，展示了上述网络模型的训练效果:
+
+    =====================  ===============================  =============  ==================================
+    网络名称                       参数数量                    错误率          配置文件
+    =====================  ===============================  =============  ==================================
+    逻辑回归模型                      252 KB                     8.652%          trainer_config.lr.py
+    词向量模型                         15 MB                      8.484%         trainer_config.emb.py
+    卷积模型                        16 MB                     5.628%          trainer_config.cnn.py
+    时序模型                         16 MB                     4.812%          trainer_config.lstm.py
+    =====================  ===============================  =============  ==================================
+
+
+附录
+=====
+
+命令行参数
+----------
+
+* \--config：网络配置
+* \--save_dir：模型存储路径
+* \--log_period：每隔多少batch打印一次日志
+* \--num_passes：训练轮次，一个pass表示过一遍所有训练样本
+* \--config_args：命令指定的参数会传入网络配置中。
+* \--init_model_path：指定初始化模型路径，可用在测试或训练时指定初始化模型。
+
+默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
+可以通过show_parameter_stats_period设置打印参数信息等。
+其他参数请参考 `命令行参数文档 <../../ui/index.html#command-line-argument>`_ 。
+
+输出日志
+---------
+
+.. code-block:: bash
+
+    TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
+
+模型训练会看到类似上面这样的日志信息，详细的参数解释，请参考如下表格：
+
+    ===========================================  ==============================================================
+    名称                                             解释
+    ===========================================  ==============================================================
+    Batch=20                                      表示过了20个batch
+    samples=2560                                  表示过了2560个样本
+    AvgCost                                          每个pass的第0个batch到当前batch所有样本的平均cost
+    CurrentCost                                      当前log_period个batch所有样本的平均cost
+    Eval: classification_error_evaluator          每个pass的第0个batch到当前batch所有样本的平均分类错误率
+    CurrentEval: classification_error_evaluator      当前log_period个batch所有样本的平均分类错误率
+    ===========================================  ==============================================================
diff --git a/doc/demo/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
similarity index 88%
rename from doc/demo/quick_start/index_en.md
rename to doc/tutorials/quick_start/index_en.md
index 80d816a768a71156ce72cda6ea92b749fbcdbe1f..4e765b23037d8b4b717d12437f839cc488badf5b 100644
--- a/doc/demo/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -12,7 +12,7 @@ This tutorial will teach the basics of deep learning (DL), including how to impl
 
 To get started, please install PaddlePaddle on your computer. Throughout this tutorial, you will learn by implementing different DL models for text classification.
 
-To install PaddlePaddle, please follow the instructions here: <a href = "../../build/index.html" >Build and Install</a>.
+To install PaddlePaddle, please follow the instructions here: <a href = "../../getstarted/build_and_install/index_en.html" >Build and Install</a>.
 
 ## Overview
 For the first step, you will use PaddlePaddle to build a **text classification** system. For example, suppose you run an e-commence  website, and you want to analyze the sentiment of user reviews to evaluate product quality.
@@ -32,7 +32,7 @@ The monitor breaks down two months after purchase.
 the classifier should output “negative“.
 
 To build your text classification system, your code will need to perform five steps:
-<center> ![](./Pipeline_en.jpg) </center>
+<center> ![](./src/Pipeline_en.jpg) </center>
 
   - Preprocess data into a standardized format.
   - Provide data to the learning model.
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
 ## Preprocess data into standardized format
 In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
 
-`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes in our machine).
+`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides script for downloading the preprocessed data as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`).
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-./preprocess.sh
 ```
 
 ## Transfer Data to Model
@@ -157,18 +156,18 @@ define_py_data_sources2(train_list='data/train.list',
                         obj="process",
                         args={"dictionary": word_dict})
 ```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../ui/data_provider/pydataprovider2.html">PyDataProvider2</a>.
+You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
 
 ## Network Architecture
 You will describe four kinds of network architectures in this section.
-<center> ![](./PipelineNetwork_en.jpg) </center>
+<center> ![](./src/PipelineNetwork_en.jpg) </center>
 
 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../ui/api/trainer_config_helpers/layers_index.html">Layer documentation</a>。All configuration files are in `demo/quick_start` directory.
+For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
 
 ### Logistic Regression
 The architecture is illustrated in the following picture:
-<center> ![](./NetLR_en.png) </center>
+<center> ![](./src/NetLR_en.png) </center>
 
 - You need define the data for text features. The size of the data layer is the number of words in the dictionary.
 
@@ -183,10 +182,10 @@ label = data_layer(name="label", size=label_dim)
 ```
 
 - It uses logistic regression model to classify the vector, and it will output the classification error during training.
-	- Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
-	- *size* for each layer means the number of neurons of the layer.
-	- *act_type* means activation function applied to the output of each neuron independently.
-	- Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
+    - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
+    - *size* for each layer means the number of neurons of the layer.
+    - *act_type* means activation function applied to the output of each neuron independently.
+    - Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
 ```python
 # Define a fully connected layer with logistic activation (also called softmax activation).
 output = fc_layer(input=word,
@@ -241,7 +240,7 @@ def process(settings, file_name):
 ```
 
 This model is very similar to the framework of logistic regression, but it uses word embedding vectors instead of a sparse vectors to represent words.
-<center> ![](./NetContinuous_en.png) </center>
+<center> ![](./src/NetContinuous_en.png) </center>
 
 - It can look up the dense word embedding vector in the dictionary  (its words embedding vector is `word_dim`). The input is a sequence of N words, the output is N word_dim dimensional vectors.
 
@@ -284,7 +283,7 @@ The performance is summarized in the following table:
 
 ### Convolutional Neural Network Model
 Convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model to 3 new sub-steps.
-<center> ![](./NetConv_en.png) </center>
+<center> ![](./src/NetConv_en.png) </center>
 
 
 Text convolution has 3 steps:
@@ -296,8 +295,8 @@ Text convolution has 3 steps:
 # context_len means convolution kernel size.
 # context_start means the start of the convolution. It can be negative. In that case, zero padding is applied.
 text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
+                               context_start=k,
+                               context_len=2 * k + 1)
 ```
 
 The performance is summarized in the following table：
@@ -325,7 +324,7 @@ The performance is summarized in the following table：
 <br>
 
 ### Recurrent Model
-<center> ![](./NetRNN_en.png) </center>
+<center> ![](./src/NetRNN_en.png) </center>
 
 You can use Recurrent neural network as our time sequence model, including simple RNN model, GRU model, and LSTM model。
 
@@ -367,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
 <br>
 
 ## Optimization Algorithm
-<a href = "../../ui/api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
+<a href = "../../api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
 
 ```python
 settings(batch_size=128,
@@ -379,7 +378,7 @@ settings(batch_size=128,
 
 ## Training Model
 After completing data preparation and network architecture specification, you will run the training script.
-<center> ![](./PipelineTrain_en.png) </center>
+<center> ![](./src/PipelineTrain_en.png) </center>
 
 Training script: our training script is in `train.sh` file. The training arguments are listed below:
 
@@ -392,10 +391,11 @@ paddle train \
 --use_gpu=false
 ```
 
-If you want to install the remote training platform, which enables distributed training on clusters, follow the instructions here: <a href = "../../cluster/index.html">Platform</a> documentation. We do not provide examples on how to train on clusters. Please refer to other demos or platform training documentation for mode details on training on clusters.
+We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
+
 ## Inference
 You can use the trained model to perform prediction on the dataset with no labels. You can also evaluate the model on dataset with labels to obtain its test accuracy.
-<center> ![](./PipelineTest_en.png) </center>
+<center> ![](./src/PipelineTest_en.png) </center>
 
 The test script is listed below. PaddlePaddle can evaluate a model on the data with labels specified in `test.list`.
 
@@ -407,7 +407,7 @@ paddle train \
 --init_model_path=./output/pass-0000x
 ```
 
-We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to: <a href = "../../ui/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../demo/index.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
+We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
 
 inference script (predict.sh)：
 
@@ -477,7 +477,7 @@ The scripts of data downloading, network configurations, and training scrips are
 <td class="left">Word embedding</td>
 <td class="left"> 15MB </td>
 <td class="left"> 8.484%</td>
-<td class="left">trainer_config.bow.py</td>
+<td class="left">trainer_config.emb.py</td>
 </tr>
 
 <tr>
@@ -509,7 +509,7 @@ The scripts of data downloading, network configurations, and training scrips are
 * \--config_args：Other configuration arguments.
 * \--init_model_path：The path of the initial model parameter.
 
-By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../ui/index.html#command-line-argument">command line argument documentation</a>。
+By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/cmd_parameter/index_en.html">command line argument documentation</a>。
 
 ### Log
 
diff --git a/doc_cn/demo/quick_start/NetContinuous.jpg b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetContinuous.jpg
rename to doc/tutorials/quick_start/src/NetContinuous_cn.jpg
diff --git a/doc/demo/quick_start/NetContinuous_en.png b/doc/tutorials/quick_start/src/NetContinuous_en.png
similarity index 100%
rename from doc/demo/quick_start/NetContinuous_en.png
rename to doc/tutorials/quick_start/src/NetContinuous_en.png
diff --git a/doc_cn/demo/quick_start/NetConv.jpg b/doc/tutorials/quick_start/src/NetConv_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetConv.jpg
rename to doc/tutorials/quick_start/src/NetConv_cn.jpg
diff --git a/doc/demo/quick_start/NetConv_en.png b/doc/tutorials/quick_start/src/NetConv_en.png
similarity index 100%
rename from doc/demo/quick_start/NetConv_en.png
rename to doc/tutorials/quick_start/src/NetConv_en.png
diff --git a/doc_cn/demo/quick_start/NetLR.jpg b/doc/tutorials/quick_start/src/NetLR_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetLR.jpg
rename to doc/tutorials/quick_start/src/NetLR_cn.jpg
diff --git a/doc/demo/quick_start/NetLR_en.png b/doc/tutorials/quick_start/src/NetLR_en.png
similarity index 100%
rename from doc/demo/quick_start/NetLR_en.png
rename to doc/tutorials/quick_start/src/NetLR_en.png
diff --git a/doc_cn/demo/quick_start/NetRNN.jpg b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetRNN.jpg
rename to doc/tutorials/quick_start/src/NetRNN_cn.jpg
diff --git a/doc/demo/quick_start/NetRNN_en.png b/doc/tutorials/quick_start/src/NetRNN_en.png
similarity index 100%
rename from doc/demo/quick_start/NetRNN_en.png
rename to doc/tutorials/quick_start/src/NetRNN_en.png
diff --git a/doc_cn/demo/quick_start/PipelineNetwork.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineNetwork.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
diff --git a/doc/demo/quick_start/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
similarity index 100%
rename from doc/demo/quick_start/PipelineNetwork_en.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
diff --git a/doc_cn/demo/quick_start/PipelineTest.jpg b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTest.jpg
rename to doc/tutorials/quick_start/src/PipelineTest_cn.jpg
diff --git a/doc/demo/quick_start/PipelineTest_en.png b/doc/tutorials/quick_start/src/PipelineTest_en.png
similarity index 100%
rename from doc/demo/quick_start/PipelineTest_en.png
rename to doc/tutorials/quick_start/src/PipelineTest_en.png
diff --git a/doc_cn/demo/quick_start/PipelineTrain.jpg b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTrain.jpg
rename to doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
diff --git a/doc/demo/quick_start/PipelineTrain_en.png b/doc/tutorials/quick_start/src/PipelineTrain_en.png
similarity index 100%
rename from doc/demo/quick_start/PipelineTrain_en.png
rename to doc/tutorials/quick_start/src/PipelineTrain_en.png
diff --git a/doc_cn/demo/quick_start/Pipeline.jpg b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/Pipeline.jpg
rename to doc/tutorials/quick_start/src/Pipeline_cn.jpg
diff --git a/doc/demo/quick_start/Pipeline_en.jpg b/doc/tutorials/quick_start/src/Pipeline_en.jpg
similarity index 100%
rename from doc/demo/quick_start/Pipeline_en.jpg
rename to doc/tutorials/quick_start/src/Pipeline_en.jpg
diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2207a776f0774e72aba15169e59258dd04583637
--- /dev/null
+++ b/doc/tutorials/rec/ml_dataset_cn.md
@@ -0,0 +1,105 @@
+```eval_rst
+.. _demo_ml_dataset:
+
+```
+
+# MovieLens数据集
+
+[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。
+该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模，该数据及有很多不同的版本。
+我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据
+集，其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。
+
+## 数据集特征
+
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。在[ml-1m 数据集]
+(http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件，
+分隔符为"::"。以下我们翻译数据集网站中README文件的描述:
+
+### 评分文件描述(ratings.dat)
+
+
+所有的评分数据都包含在"ratings.dat"文件中，遵循如下的格式:
+
+用户ID::电影ID::评分::时间戳
+
+- 用户ID范围从1到6040
+- 电影ID范围从1到3952
+- 评分被调整为5星的规模(只允许整数的星级)
+- 时间戳表示为从1970-01-01(UTC)来的秒数，与time(2)的返回值一致
+- 每位用户至少有20条评分
+
+### 用户文件描述(users.dat)
+
+所有的用户信息都包含在"users.dat"文件中，遵循如下的格式:
+
+用户ID::性别::年龄::职业::邮编
+
+所有的人口统计学信息由用户自愿提供，没有进行正确性的检查。只有含有人
+口统计学信息的用户才被包含在数据集中。
+
+- 性别，用"M"表示男性，"F"表示女性
+- 年龄从下列列表范围中选取:
+
+	*   1:	"18岁以下"
+	*  18:	"18-24岁"
+	*  25:	"25-34岁"
+	*  35:	"35-44岁"
+	*  45:	"45-49岁"
+	*  50:	"50-55岁"
+	*  56:	"56+"
+
+- 职业从下面所列中选择:
+
+	*   0:  "其他"或不确定
+	*   1:  "学术/教育工作者"
+	*   2:  "艺术家"
+	*   3:  "文书工作/管理员"
+	*   4:  "大学生/研究生"
+	*   5:  "客户服务"
+	*   6:  "医生/医疗保健"
+	*   7:  "行政工作/管理人员"
+	*   8:  "农民"
+	*   9:  "操持家务者"
+	*  10:  "高中毕业生"
+	*  11:  "律师"
+	*  12:  "程序员"
+	*  13:  "退休人员"
+	*  14:  "销售/市场"
+	*  15:  "科学家"
+	*  16:  "自由职业者"
+	*  17:  "技术员/工程师"
+	*  18:  "推销员/手工艺者"
+	*  19:  "无业人士"
+	*  20:  "作家"
+
+### 电影文件描述(movies.dat)
+
+所有的电影信息都包含在"movies.dat"文件中，遵循如下的格式:
+
+电影ID::电影名称::电影类型
+
+- 电影名称（包括发行时间）与IMDB网站提供的一致
+- 电影类型如符合多种用管道符号|分割，选自下列类型:
+
+	*	动作片
+	*	冒险片
+	*	动画片
+	*	儿童片
+	*	喜剧片
+	*	犯罪片
+	*	纪录片
+	*	戏剧
+	*	奇幻片
+	*	黑色电影
+	*	恐怖片
+	*	音乐剧
+	*	悬疑片
+	*	浪漫片
+	*	科幻片
+	*	惊险电影
+	*	战争片
+	*	西部片
+
+- 由于意外的副本记录和测试记录，有些电影ID可能与实际电影不相符合
+- 电影大部分是手工输入数据，因此可能会有一些错误和不一致发生
diff --git a/doc/demo/rec/ml_dataset.md b/doc/tutorials/rec/ml_dataset_en.md
similarity index 98%
rename from doc/demo/rec/ml_dataset.md
rename to doc/tutorials/rec/ml_dataset_en.md
index c93a4585e4027b1912da8a77c2562d1ee69c5366..25dea5c4afbf1ce1c1ac6195cbd245b116459e2e 100644
--- a/doc/demo/rec/ml_dataset.md
+++ b/doc/tutorials/rec/ml_dataset_en.md
@@ -1,3 +1,7 @@
+```eval_rst
+..  _demo_ml_dataset:
+```
+
 # MovieLens Dataset
 
 The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research.
diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a084e4790c78018a2cb02392e22cf59a4b94aeeb
--- /dev/null
+++ b/doc/tutorials/rec/ml_regression_cn.rst
@@ -0,0 +1,349 @@
+MovieLens数据集评分回归模型
+=========================
+
+这里我们在MovieLens数据集描述一种 **余弦相似度回归** 任务。
+该示例将展示paddle如何进行词向量嵌入，处理相似度回归，针对文本
+的单词级别的卷积神经网络，以及paddle如何处理多种类型的输入。
+需要注意的是，该模型网络只是用于进行demo展示paddle如何工作，而
+没有进行结构的微调。
+
+
+**我们非常欢迎您用PADDLEPADDLE构建更好的示例，如果您有好的建议来
+让这个示例变得更好，希望能让我们知晓。**
+
+数据准备
+```````
+下载并解压数据集
+''''''''''''''
+这里我们使用 :ref:`demo_ml_dataset` 。
+要下载和解压数据集，只需要简单的运行下面的命令即可。
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	./ml_data.sh
+
+:code:`demo/recommendation/data/ml-1m` 的目录结构为:
+
+.. code-block:: text
+
+	+--ml-1m
+		+--- movies.dat 	# 电影特征
+		+--- ratings.dat 	# 评分
+		+--- users.dat 		# 用户特征
+		+--- README 		# 数据集描述
+
+字段配置文件
+''''''''''
+**字段配置文件** 用来具体说明数据集的字段和文件格式，
+例如，说明每个特征文件具体字段是 **什么** 类型。
+
+ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。
+其具体说明了字段类型和文件名称:
+
+1) 用户文件中有四种类型的字段\: 编号，性别，年龄和职业；
+
+2) 文件名称为"users.dat"，文件的分隔符为"::"。
+
+.. include:: ../../../demo/recommendation/data/config.json
+   :code: json
+   :literal:
+
+准备数据
+```````
+你需要安装python的第三方库。
+**强烈推荐使用VIRTUALENV来创造一个干净的python环境。**
+
+.. code-block:: bash
+
+	pip install -r requirements.txt
+
+预处理数据一般的命令为:
+
+.. code-block:: bash
+
+	cd demo/recommendation
+	./preprocess.sh
+
+下面介绍预处理过程具体的步骤。
+
+提取电影或用户的特征并生成python对象
+''''''''''''''''''''''''''''''''
+
+在movielens 1m数据集中，电影和用户有许多的特征。
+评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。
+我们首先处理电影或用户的特征文件，然后用pickle命令将特征( **Meta** )对象存储为文件。
+
+Meta配置文件
+...........
+
+**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。
+该文件可以从字段配置文件生成，或是手动编辑生成。文件的格式可以
+为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。
+
+要将字段配置文件转化为meta配置文件，只需要运行：
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	python config_generator.py config.json > meta_config.json
+
+生成的meta配置文件如下所示：
+
+.. include:: ../../../demo/recommendation/data/meta_config.json
+	:code: json
+	:literal:
+
+在meta文件中有两种特征\: 电影和用户。
+
+* 在电影文件movies.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是电影名
+		* 利用正则表达式来解析该特征
+		* 基于字母的词嵌入特征
+		* 是序列
+	* pos 2 特征：
+		* name是体裁
+		* type是one hot稠密向量
+		* dictionary由解析自动生成，每一个key由'|'分隔
+* 在用户文件users.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是性别
+		* 简单的基于字母的词嵌入
+	* pos 2 特征：
+		* name是年龄
+		* 是整个的词嵌入
+		* 嵌入编号会根据单词排序
+	* pos 3 特征：
+		* name是职业
+		* 简单的整个词嵌入
+
+
+Meta文件
+''''''''
+
+有了meta配置文件之后，我们可以生成 **Meta文件** ，该文件是python的pickle对象，
+存储着电影或用户信息。可以运行下面的命令来生成。
+
+.. code-block:: bash
+
+	python meta_generator.py ml-1m meta.bin --config=meta_config.json
+
+meta文件 :code:`meta.bin` 的结构如下：
+
+.. code-block:: text
+
+    +--+ movie
+    |      +--+ __meta__
+    |      |       +--+ raw_meta  # 每个特征的meta配置。列表
+    |      |       |       +
+    |      |       |       |     # 编号字段，我们用编号作为key 
+    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
+    |      |       |       |
+    |      |       |       |     # 电影名字段，嵌入特征字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
+    |      |       |       |
+    |      |       |       |     # 体裁字段，体裁字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
+    |      |       |
+    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
+    |      |                               # it means there are 2 features for each key.
+    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
+    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
+    |      |
+    |      +--+ 1 # 电影1的特征
+    |      |    +
+    |      |    +---+ [[...], [...]] # title ids, genres dense vector
+    |      |
+    |      +--+ 2
+    |      |
+    |      +--+ ...
+    |
+    +--- user
+           +--+ __meta__
+           |       +
+           |       +--+ raw_meta
+           |       |       +
+           |       |       +--+ id field as user
+           |       |       |
+           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
+           |       |
+           |       +--+ feature_map [1, 2, 3]
+           |
+           +--+ 1 # 用户1的特征
+           |
+           +--+ 2
+           +--+ ...
+
+
+分割训练/测试文件
+'''''''''''''''
+
+我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是：对于每位用户，我们将评分分成两部分。
+这样的话每位用户在测试文件中将与训练文件含有同样的信息。
+
+用 :code:`separate.py` 来分离训练和测试文件。
+
+.. code-block:: bash
+
+	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
+
+这样就会生成两个文件：:code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.data.test` 。
+将他们移动到目录 :code:`data` ，然后进行随机打乱，再为paddle的训练过程提供文件列表。
+
+..  code-block:: bash
+
+    shuf ml-1m/ratings.dat.train > ratings.dat.train
+    cp ml-1m/ratings.dat.test .
+    echo "./data/ratings.dat.train" > train.list
+    echo "./data/ratings.dat.test" > test.list
+
+
+神经网络结构配置
+``````````````
+
+训练器配置文件
+''''''''''''
+
+网络结构如下图所示：
+
+..  image:: rec_regression_network.png
+    :align: center
+    :alt: rec_regression_network
+
+该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示：
+
+..  literalinclude:: ../../../demo/recommendation/trainer_config.py
+    :language: python
+    :lines: 15-
+
+在文件 :code:`trainer_config.py` 中，我们仅仅是将每个特征种类映射到一个特征向量中，以下
+展示了如何将每个特征映射到一个向量。
+
+* :code:`id` \: 仅仅是简单的嵌入，然后添加一个全连接层。
+* :code:`embedding` \:
+    - 如果是序列，则先做嵌入，然后再做一次文本卷积网络操作，
+      然后得到平均采样的结果。
+    - 如果不是序列，则先做嵌入，然后添加一个全连接层。
+* :code:`one_host_dense` \:
+    - 仅仅是两个全连接层。
+
+然后我们利用多输入的:code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征，
+并且对用户的特征做同样的操作，也得到一个用户特征。然后我们求这两个特征的余弦相似度。
+
+在这些网络中，我们用以下的一些:ref:`api_trainer_config` 中的接口。
+
+*  数据层， :ref:`api_trainer_config_helpers_layers_data_layer`
+*  全连接层， :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  嵌入层， :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  文本投影层， :ref:`api_trainer_config_helpers_layers_context_projection`
+*  采样层， :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  余弦相似度层， :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  文本卷积采样层， :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  声明Python数据源， :ref:`api_trainer_config_helpers_data_sources` 
+
+数据提供脚本
+'''''''''''
+
+..  literalinclude:: ../../../demo/recommendation/dataprovider.py
+    :language: python
+    :lines: 15-
+
+数据提供脚本仅仅是读取meta.bin和评分文件，生成训练需要的样本。
+在脚本 :code:`dataprovider.py` 中，我们需要设置：
+
+* obj.slots\: 特征的类型和维度。
+* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。
+* process\: 返回数据的每一条样本给 :code:`paddle` 。
+
+数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider` 。
+
+训练
+````
+
+准备好数据，配置了网络，编写好数据提供脚本后，现在我们可以开始paddle训练了。
+
+代码 :code:`run.sh` 如下：
+
+..  literalinclude:: ../../../demo/recommendation/run.sh
+    :language: bash
+    :lines: 16-
+
+该脚本仅仅是开始一个paddle训练过程，将日志写入文件 :code:`log.txt` ，然后
+打印在屏幕上。
+
+脚本 :code:`run.sh` 中的每一行命令，请参考页面 :ref:`cmd_line_index` 。
+这些参数的简短介绍如下：
+
+*  config\: 告诉paddle哪个文件是神经网络的配置文件。
+*  save_dir\: 告诉paddle将模型保存在: code:`./output` 中。
+*  use_gpu\: 是否使用GPU，默认为不使用。
+*  trainer_count\: 一台机器上面的线程数量。
+*  test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则，
+   每个测试周期测试: code:`batch_size` 批次的数据。
+*  log_period\: 在训练了: code:`log_period` 批次后打印日志。
+*  dot_period\: 在每训练: code:`dot_period` 个批次后打印一个 :code:`.` 。
+*  num_passes\: 训练至多: code:`num_passes` 轮。
+
+如果训练过程启动成功的话，输出应该类似如下：
+
+..  code-block:: text
+
+    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
+
+    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
+
+    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
+
+    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
+
+    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
+    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
+    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
+    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
+    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
+    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
+    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
+
+模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。
+
+模型评估和预测
+````````````
+
+在训练了几个轮次以后，你可以对模型进行评估，得到最好轮次下的模型。运行下面命令即可：
+
+.. code-block:: bash
+
+    ./evaluate.sh 
+
+你将看到如下的信息：
+
+.. code-block:: text
+
+    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
+    evaluating from pass output/pass-00009
+
+然后，你可以预测任何用户对于任何一部电影的评价，运行下面命令即可：
+
+..  code-block:: bash
+
+    python prediction.py 'output/pass-00009/'
+
+预测程序将读取用户的输入，然后输出预测分数。用户预测的命令行界面如下：
+
+..  code-block:: text
+
+    Input movie_id: 9
+    Input user_id: 4
+    Prediction Score is 2.56
+    Input movie_id: 8
+    Input user_id: 2
+    Prediction Score is 3.13
diff --git a/doc/demo/rec/ml_regression.rst b/doc/tutorials/rec/ml_regression_en.rst
similarity index 87%
rename from doc/demo/rec/ml_regression.rst
rename to doc/tutorials/rec/ml_regression_en.rst
index 0c14e4f5bb7f815a06c0c756b1a6e6ef9099fd66..993b9a516f134ff8b59e8755b721f76c8f32f0fd 100644
--- a/doc/demo/rec/ml_regression.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
@@ -16,7 +16,7 @@ Data Preparation
 ````````````````
 Download and extract dataset
 ''''''''''''''''''''''''''''
-We use `movielens 1m dataset <ml_dataset.html>`_ here. 
+We use :ref:`demo_ml_dataset` here. 
 To download and unzip the dataset, simply run the following commands.
 
 ..  code-block:: bash
@@ -36,7 +36,7 @@ And the directory structure of :code:`demo/recommendation/data/ml-1m` is:
 
 Field config file
 '''''''''''''''''
-**Field config file** is used to specific the fields dataset and file format,
+**Field config file** is used to specify the fields of the dataset and the file format,
 i.e, specific **WHAT** type it is in each feature file.
 
 The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`.
@@ -188,7 +188,7 @@ Split Training/Testing files
 We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the
 rating by two parts. So each user in testing file will have some rating information in training file.
 
-Use separate.py to separate the training and testing file.
+Use :code:`separate.py` to separate the training and testing file.
 
 ..  code-block:: bash
 
@@ -217,7 +217,7 @@ The network structure shows below.
     :align: center
     :alt: rec_regression_network
 
-The demo's neural network config file "trainer_config.py" show as below.
+The demo's neural network config file :code:`trainer_config.py` show as below.
 
 ..  literalinclude:: ../../../demo/recommendation/trainer_config.py
     :language: python
@@ -239,26 +239,16 @@ Then we combine each features of movie into one movie feature by a
 get one user feature. Then we calculate the cosine similarity of these two
 features.
 
-In these network, we use several api in `trainer_config_helpers
-<../../ui/api/trainer_config_helpers/index.html>`_. There are
-
-*  Data Layer, `data_layer 
-   <../../ui/api/trainer_config_helpers/layers.html#id1>`_
-*  Fully Connected Layer, `fc_layer
-   <../../ui/api/trainer_config_helpers/layers.html#fc-layer>`_
-*  Embedding Layer, `embedding_layer
-   <../../ui/api/trainer_config_helpers/layers.html#embedding-layer>`_
-*  Context Projection Layer, `context_projection
-   <../../ui/api/trainer_config_helpers/layers.html#context-projection>`_
-*  Pooling Layer, `pooling_layer
-   <../../ui/api/trainer_config_helpers/layers.html#pooling-layer>`_
-*  Cosine Similarity Layer, `cos_sim
-   <../../ui/api/trainer_config_helpers/layers.html#cos-sim>`_
-*  Text Convolution Pooling Layer, `text_conv_pool
-   <../../ui/api/trainer_config_helpers/networks.html
-   #trainer_config_helpers.networks.text_conv_pool>`_
-*  Declare Python Data Sources, `define_py_data_sources2
-   <../../ui/api/trainer_config_helpers/data_sources.html>`_
+In these networks, we use several APIs in :ref:`api_trainer_config` . There are
+
+*  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
+*  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection`
+*  Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`.
 
 Data Provider
 '''''''''''''
@@ -274,27 +264,26 @@ In this :code:`dataprovider.py`, we should set\:
 * use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not.
 * process\: Return each sample of data to :code:`paddle`.
 
-The data provider details document see `there <../../ui/data_provider/pydataprovider2.html>`_.
+The data provider details document see :ref:`api_pydataprovider2`.
 
 Train
 `````
 
 After prepare data, config network, writting data provider, now we can run paddle training.
 
-The run.sh is shown as follow:
+The :code:`run.sh` is shown as follow:
 
 ..  literalinclude:: ../../../demo/recommendation/run.sh
     :language: bash
     :lines: 16-
 
-It just start a paddle training process, write the log to `log.txt`,
+It just start a paddle training process, write the log to :code:`log.txt`,
 then print it on screen.
 
-Each command line argument in :code:`run.sh`, please refer to the `command line
-arguments <../../ui/index.html#command-line-argument>`_ page. The short description of these arguments is shown as follow.
+Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. The short description of these arguments is shown as follow.
 
 *  config\: Tell paddle which file is neural network configuration.
-*  save_dir\: Tell paddle save model into './output'
+*  save_dir\: Tell paddle save model into :code:`./output`.
 *  use_gpu\: Use gpu or not. Default is false.
 *  trainer_count\: The compute thread in one machine.
 *  test_all_data_in_one_period\: Test All Data during one test period. Otherwise,
diff --git a/doc/demo/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png
similarity index 100%
rename from doc/demo/rec/rec_regression_network.png
rename to doc/tutorials/rec/rec_regression_network.png
diff --git a/doc/demo/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg
similarity index 100%
rename from doc/demo/semantic_role_labeling/feature.jpg
rename to doc/tutorials/semantic_role_labeling/feature.jpg
diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f6061766c038a7bb6e4ae376685a10cd5669d2ed
--- /dev/null
+++ b/doc/tutorials/semantic_role_labeling/index_cn.md
@@ -0,0 +1,201 @@
+# 语义角色标注教程 #
+
+语义角色标注（Semantic role labeling, SRL）是浅层语义解析的一种形式，其目的是在给定的输入句子中发现每个谓词的谓词论元结构。 SRL作为很多自然语言处理任务中的中间步骤是很有用的，如信息提取、文档自动分类和问答。 实例如下 [1]:
+
+ [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
+
+- V: 动词
+- A0: 接受者
+- A1: 接受的东西
+- A2: 从……接受
+- A3: 属性
+- AM-MOD: 情态动词 
+- AM-NEG: 否定
+
+给定动词“accept”，句子中的组块将会扮演某些语义角色。这里，标签方案来自 Penn Proposition Bank。
+
+到目前为止，大多数成功的SRL系统是建立在某种形式的句法分析结果之上的，使用了基于句法结构的预定义特征模板。 本教程将介绍使用深度双向长短期记忆（DB-LSTM）模型[2]的端到端系统来解决SRL任务，这在很大程度上优于先前的最先进的系统。 这个系统将SRL任务视为序列标注问题。
+
+## 数据描述
+相关论文[2]采用 CoNLL-2005＆2012 共享任务中设置的数据进行训练和测试。由于数据许可的原因，演示采用 CoNLL-2005 的测试数据集，可以在网站上找到。
+
+用户只需执行以下命令就可以下载并处理原始数据：
+
+```bash
+cd data
+./get_data.sh
+```
+`data `目录会出现如下几个新的文件：
+```bash
+conll05st-release：the test data set of CoNll-2005 shared task 
+test.wsj.words：the Wall Street Journal data sentences
+test.wsj.props:  the propositional arguments
+feature: the extracted features from data set
+```
+
+## 训练
+### DB-LSTM
+请参阅情感分析的演示以了解有关长期短期记忆单元的更多信息。
+
+与在 Sentiment Analysis 演示中使用的 Bidirectional-LSTM 不同，DB-LSTM 采用另一种方法来堆叠LSTM层。首先，标准LSTM以正向处理该序列。该 LSTM 层的输入和输出作为下一个 LSTM 层的输入，并被反向处理。这两个标准 LSTM 层组成一对 LSTM。然后我们堆叠一对对的 LSTM 层后得到深度 LSTM 模型。
+
+下图展示了时间扩展的2层 DB-LSTM 网络。
+<center>
+![pic](./network_arch.png)
+</center>
+
+### 特征
+两个输入特征在这个流程中起着至关重要的作用：predicate（pred）和argument（arguments）。 还采用了两个其他特征：谓词上下文（ctx-p）和区域标记（mr）。 因为单个谓词不能精确地描述谓词信息，特别是当相同的词在句子中出现多于一次时。 使用谓词上下文，可以在很大程度上消除歧义。类似地，如果它位于谓词上下文区域中，则使用区域标记 m<sub>r</sub> = 1 来表示参数位置，反之则 m<sub>r</sub> = 0。这四个简单的特征是我们的SRL系统所需要的。上下文大小设置为1的一个样本的特征如下[2]所示：
+<center>
+![pic](./feature.jpg)
+</center>
+
+在这个示例中，相应的标记句子是：
+
+[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
+
+在演示中, 我们采用上面的特征模板, 包括：  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` 并使用 `B/I/O` 方案来标记每个参数。这些特征和标签存储在 `feature` 文件中, 用`\t`分割。
+
+### 数据提供
+
+`dataprovider.py` 是一个包装数据的 Python 文件。 函数 `hook()` 定义了网络的数据槽。六个特征和标签都是索引槽。
+```
+def hook(settings, word_dict, label_dict, **kwargs):
+    settings.word_dict = word_dict
+    settings.label_dict = label_dict
+    #all inputs are integral and sequential type
+    settings.slots = [
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(2),
+        integer_value_sequence(len(label_dict))]
+```
+相应的数据迭代器如下：
+```
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line in fdata:
+            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
+                line.strip().split('\t')
+
+            words = sentence.split()
+            sen_len = len(words)
+            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            marks = mark.split()
+            mark_slot = [int(w) for w in marks]
+
+            label_list = label.split()
+            label_slot = [settings.label_dict.get(w) for w in label_list]
+            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+函数 `process` 返回8个特征list和1个标签list。
+
+### 神经网络配置
+
+`db_lstm.py` 是在训练过程中加载字典并定义数据提供程序模块和网络架构的神经网络配置文件。
+
+九个 `data_layer` 从数据提供程序加载实例。八个特征分别转换为向量，并由`mixed_layer`混合。 深度双向LSTM层提取softmax层的特征。目标函数是标签的交叉熵。
+
+### 训练 
+训练的脚本是 `train.sh`，用户只需执行:
+```bash
+  ./train.sh
+```
+`train.sh` 中的内容：
+```
+paddle train \
+  --config=./db_lstm.py \
+  --use_gpu=0 \
+  --log_period=5000 \
+  --trainer_count=1 \
+  --show_parameter_stats_period=5000 \
+  --save_dir=./output \
+  --num_passes=10000 \
+  --average_test_period=10000000 \
+  --init_model_path=./data \
+  --load_missing_parameter_strategy=rand \
+  --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+-  \--config=./db_lstm.py : 网络配置文件
+-  \--use_gpu=false: 使用 CPU 训练（如果已安装 PaddlePaddle GPU版本并想使用 GPU 训练可以设置为true，目前 crf_layer 不支持 GPU）
+-  \--log_period=500: 每20个batch输出日志
+-  \--trainer_count=1: 设置线程数（或 GPU 数）
+-  \--show_parameter_stats_period=5000: 每100个batch显示参数统计
+-  \--save_dir=./output: 模型输出路径
+-  \--num_passes=10000: 设置数据遍历次数，一个pass意味着PaddlePaddle训练数据集中的所有样本被遍历一次
+-  \--average_test_period=10000000:  每个 average_test_period 批次对平均参数进行测试
+-  \--init_model_path=./data: 参数初始化路径
+-  \--load_missing_parameter_strategy=rand: 随机初始不存在的参数
+-  \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
+
+
+训练后，模型将保存在目录`output`中。 我们的训练曲线如下：
+<center>
+![pic](./src/curve.jpg)
+</center>
+
+### 测试
+测试脚本是 `test.sh`, 执行:
+```bash
+  ./test.sh
+```
+`tesh.sh` 的主要部分：
+```
+paddle train \
+  --config=./db_lstm.py \
+  --model_list=$model_list \
+  --job=test \
+  --config_args=is_test=1 \
+```
+
+  - \--config=./db_lstm.py: 网络配置文件
+  - \--model_list=$model_list.list: 模型列表文件
+  - \--job=test: 指示测试任务
+  - \--config_args=is_test=1: 指示测试任务的标记
+  - \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
+  
+
+### 预测
+预测脚本是 `predict.sh`，用户只需执行：
+```bash
+  ./predict.sh
+  
+```
+在`predict.sh`中，用户应该提供网络配置文件，模型路径，标签文件，字典文件，特征文件。
+```
+python predict.py 
+     -c $config_file \
+     -w $best_model_path \
+     -l $label_file \
+     -p $predicate_dict_file  \
+     -d $dict_file \
+     -i $input_file \
+     -o $output_file
+```
+
+`predict.py` 是主要的可执行python脚本，其中包括函数：加载模型，加载数据，数据预测。网络模型将输出标签的概率分布。 在演示中，我们使用最大概率的标签作为结果。用户还可以根据概率分布矩阵实现柱搜索或维特比解码。
+
+预测后，结果保存在 `predict.res` 中。
+
+## 引用
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/tutorials/semantic_role_labeling/index_en.md
similarity index 97%
rename from doc/demo/semantic_role_labeling/semantic_role_labeling.md
rename to doc/tutorials/semantic_role_labeling/index_en.md
index e2793b2b3494160a7a80f07ec2127bd1f1a4f2e4..92d7c634832119c718711a57c16f69492d405f28 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/tutorials/semantic_role_labeling/index_en.md
@@ -1,200 +1,204 @@
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering.  An instance is as following [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal 
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. 
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license,  the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release：the test data set of CoNll-2005 shared task 
-test.wsj.words：the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. 
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-<center>
-![pic](./network_arch.png)
-</center>
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-<center>
-![pic](./feature.jpg)
-</center>
-
-In this sample, the coresponding labelled sentence is:
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-In the demo, we adopt the feature template as above, consists of :  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The  Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-The `process`function yield 9 lists which are 8 features and label.
- 
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the  data provider module and network architecture during the training procedure. 
-
-Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` .  Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training 
-The script for training is `train.sh`, user just need to execute:
-```bash
-  ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : network config file.
--  \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train, until now crf_layer do not support GPU
--  \--log_period=500: print log every 20 batches.
--  \--trainer_count=1: set thread number (or GPU count).
--  \--show_parameter_stats_period=5000: show parameter statistic every 100 batches.
--  \--save_dir=./output: output path to save models.
--  \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
--  \--average_test_period=10000000:  do test on average parameter every average_test_period batches
--  \--init_model_path=./data: parameter initialization path 
--  \--load_missing_parameter_strategy=rand: random initialization unexisted parameters
--  \--test_all_data_in_one_period=1: test all data in one period
-
-
-After training, the models  will be saved in directory `output`. Our training curve is as following:
-<center>
-![pic](./curve.jpg)
-</center>
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
-  ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: network config file
-  - \--model_list=$model_list.list: model list file
-  - \--job=test: indicate the test job
-  - \--config_args=is_test=1: flag to indicate test
-  - \--test_all_data_in_one_period=1: test all data in 1 period
-  
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
-  ./predict.sh
-  
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py 
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction,  the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+```eval_rst
+..  _semantic_role_labeling:
+```
+
+# Semantic Role labeling Tutorial #
+
+Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering.  An instance is as following [1]:
+
+ [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
+
+- V: verb
+- A0: acceptor
+- A1: thing accepted
+- A2: accepted-from
+- A3: Attribute
+- AM-MOD: modal 
+- AM-NEG: negation
+
+Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
+
+To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. 
+
+## Data Description
+The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license,  the demo adopts the test data set of CoNLL-2005, which can be reached on website.
+
+To download and process the original data, user just need to execute the following command:
+
+```bash
+cd data
+./get_data.sh
+```
+Several new files appear in the `data `directory as follows.
+```bash
+conll05st-release：the test data set of CoNll-2005 shared task 
+test.wsj.words：the Wall Street Journal data sentences
+test.wsj.props:  the propositional arguments
+feature: the extracted features from data set
+```
+
+## Training
+### DB-LSTM
+Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
+
+Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. 
+
+The following figure shows a temporal expanded 2-layer DB-LSTM network.
+<center>
+![pic](./src/network_arch.png)
+</center>
+
+### Features
+Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
+<center>
+![pic](./src/feature.jpg)
+</center>
+
+In this sample, the coresponding labelled sentence is:
+
+[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
+
+In the demo, we adopt the feature template as above, consists of :  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
+
+### Data Provider
+
+`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The  Six features and label are all IndexSlots.
+```
+def hook(settings, word_dict, label_dict, **kwargs):
+    settings.word_dict = word_dict
+    settings.label_dict = label_dict
+    #all inputs are integral and sequential type
+    settings.slots = [
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(2),
+        integer_value_sequence(len(label_dict))]
+```
+The corresponding data iterator is as following:
+```
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line in fdata:
+            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
+                line.strip().split('\t')
+
+            words = sentence.split()
+            sen_len = len(words)
+            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            marks = mark.split()
+            mark_slot = [int(w) for w in marks]
+
+            label_list = label.split()
+            label_slot = [settings.label_dict.get(w) for w in label_list]
+            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+The `process`function yield 9 lists which are 8 features and label.
+ 
+### Neural Network Config
+`db_lstm.py` is the neural network config file to load the dictionaries and define the  data provider module and network architecture during the training procedure. 
+
+Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` .  Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
+
+### Run Training 
+The script for training is `train.sh`, user just need to execute:
+```bash
+  ./train.sh
+```
+The content in `train.sh`:
+```
+paddle train \
+  --config=./db_lstm.py \
+  --use_gpu=0 \
+  --log_period=5000 \
+  --trainer_count=1 \
+  --show_parameter_stats_period=5000 \
+  --save_dir=./output \
+  --num_passes=10000 \
+  --average_test_period=10000000 \
+  --init_model_path=./data \
+  --load_missing_parameter_strategy=rand \
+  --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+-  \--config=./db_lstm.py : network config file.
+-  \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train, until now crf_layer do not support GPU
+-  \--log_period=500: print log every 20 batches.
+-  \--trainer_count=1: set thread number (or GPU count).
+-  \--show_parameter_stats_period=5000: show parameter statistic every 100 batches.
+-  \--save_dir=./output: output path to save models.
+-  \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
+-  \--average_test_period=10000000:  do test on average parameter every average_test_period batches
+-  \--init_model_path=./data: parameter initialization path 
+-  \--load_missing_parameter_strategy=rand: random initialization unexisted parameters
+-  \--test_all_data_in_one_period=1: test all data in one period
+
+
+After training, the models  will be saved in directory `output`. Our training curve is as following:
+<center>
+![pic](./src/curve.jpg)
+</center>
+
+### Run testing
+The script for testing is `test.sh`, user just need to execute:
+```bash
+  ./test.sh
+```
+The main part in `tesh.sh`
+```
+paddle train \
+  --config=./db_lstm.py \
+  --model_list=$model_list \
+  --job=test \
+  --config_args=is_test=1 \
+```
+
+  - \--config=./db_lstm.py: network config file
+  - \--model_list=$model_list.list: model list file
+  - \--job=test: indicate the test job
+  - \--config_args=is_test=1: flag to indicate test
+  - \--test_all_data_in_one_period=1: test all data in 1 period
+  
+
+### Run prediction
+The script for prediction is `predict.sh`, user just need to execute:
+```bash
+  ./predict.sh
+  
+```
+In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
+```
+python predict.py 
+     -c $config_file \
+     -w $best_model_path \
+     -l $label_file \
+     -p $predicate_dict_file  \
+     -d $dict_file \
+     -i $input_file \
+     -o $output_file
+```
+
+`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
+
+After prediction,  the result is saved in `predict.res`.
+
+## Reference
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png
similarity index 100%
rename from doc/demo/semantic_role_labeling/network_arch.png
rename to doc/tutorials/semantic_role_labeling/network_arch.png
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg
similarity index 100%
rename from doc/demo/semantic_role_labeling/curve.jpg
rename to doc/tutorials/semantic_role_labeling/src/curve.jpg
diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e3310e4ace5613917e7779d3198ccbb3cdc5ada
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/feature.jpg differ
diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae7864212f2a0a38102ee7ff600527ea99fec82
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/network_arch.png differ
diff --git a/doc/demo/sentiment_analysis/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/bi_lstm.jpg
similarity index 100%
rename from doc/demo/sentiment_analysis/bi_lstm.jpg
rename to doc/tutorials/sentiment_analysis/bi_lstm.jpg
diff --git a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md b/doc/tutorials/sentiment_analysis/index_cn.md
similarity index 93%
rename from doc_cn/demo/sentiment_analysis/sentiment_analysis.md
rename to doc/tutorials/sentiment_analysis/index_cn.md
index b70f2d59675615c26b29932cdf99d728bb206148..1323ec1a6abb2e7b5eeb2fbfff9cce5fe78a2c06 100644
--- a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/tutorials/sentiment_analysis/index_cn.md
@@ -1,324 +1,325 @@
-# 情感分析教程
-
-情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
-
-情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
-
-另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
-
-本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
-
-## 数椐准备
-
-### IMDB 数椐介绍
-
-训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: 从外部网站上下载的原始数椐集。
-* imdb: 仅包含训练和测试数椐集。
-* mosesdecoder-master: Moses 工具。
-
-IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: 训练数椐集。
-* test : 测试数椐集。
-* imdb.vocab: 字典文件。
-* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
-* README: 数椐说明文档。
-
-测试集和训练集目录包含下面的文件:
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* unsup: 未标记的评价样本，包含50,000个txt文件。
-* urls_xx.txt: 每个评论的网址。
-* xxBow.feat: 用于统计词频的Bow模型特征。
-
-### IMDB 数椐准备
-
-在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i data_dir
-```
-
-* data_dir: 输入数椐所在目录。
-* preprocess.py: 预处理脚本。
-
-运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
-* train.list and test.list: 训练集和测试集文件列表。
-* dict.txt: 利用训练集生成的字典。
-* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
-
-### 用户自定义数椐预处理
-
-如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 一级目录。
-* train, test: 二级目录。
-* class1,class2,...: 三级目录。
-* text_files: 文本格式的实例文件。
-
-所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
-
-## 训练模型
-
-在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
-
-<center>![LSTM](../../../doc/demo/sentiment_analysis/lstm.png)</center>
-<center>图表 1. LSTM [3]</center>
-
-情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
-
-在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
-
-#### 双向LSTM
-
-图2是双向LSTM网络，后面连全连接层和softmax层。
-
-<center>![BiLSTM](../../../doc/demo/sentiment_analysis/bi_lstm.jpg)</center>
-<center>图 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
-
-<center>![StackedLSTM](../../../doc/demo/sentiment_analysis/stacked_lstm.jpg)</center>
-<center>图 3. Stacked-LSTM for sentiment analysis </center>
-
-**配置**
-
-进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **数椐定义**:
-   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
-   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
-
-* **算法配置**:
-   * 使用随机梯度下降（sgd）算法。
-   * 使用 adam 优化。
-   * 设置batch size大小为128。
-   * 设置平均sgd窗口。
-   * 设置全局学习率。
-* **网络配置**:
-   * dict_dim: 获取字典维度。
-   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
-   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
-   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
-
-**训练**
-
-首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: 设置网络配置。
-* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
-* \--job=train: 设置工作模式为训练。
-* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
-* \--trainer\_count=4:设置线程数（或GPU个数）。
-* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
-* \--log\_period=20: 每20个batch打印一次日志。
-* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
-* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
-
-如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: 表示训练了xx个Batch。
-- samples=xx: 表示训练了xx个样本。。
-- AvgCost=xx: 从第0个batch到当前batch的平均损失。
-- CurrentCost=xx: 最新log_period个batch处理的当前损失。
-- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
-- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
-- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
-
-默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
-
-## 测试模型
-
-测试模型是指使用训练出的模型评估已标记的验证集。
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## 预测
-
-`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d data/pre-imdb/dict.txt \
-     -i data/aclImdb/test/pos/10007_10.txt
-```
-
-* `predict.py`: 预测接口脚本。
-*  -n $config : 设置网络配置。
-*  -w $model: 设置模型路径。
-*  -b $label: 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
-*  -d data/pre-imdb/dict.txt: 设置字典文件。
-*  -i data/aclImdb/test/pos/10014_7.txt: 设置一个要预测的示例文件。
-
-注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
-
-本示例的预测结果：
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-我们真诚地感谢您的关注，并欢迎您来参与贡献。
-
-## 参考文档
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
+# 情感分析教程
+
+情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
+
+情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
+
+另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
+
+本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
+
+## 数椐准备
+
+### IMDB 数椐介绍
+
+训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
+
+```
+cd demo/sentiment/data
+./get_imdb.sh
+```
+如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
+
+```
+aclImdb  get_imdb.sh  imdb  mosesdecoder-master
+```
+
+* aclImdb: 从外部网站上下载的原始数椐集。
+* imdb: 仅包含训练和测试数椐集。
+* mosesdecoder-master: Moses 工具。
+
+IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
+
+```
+imdbEr.txt  imdb.vocab  README  test  train
+```
+* train: 训练数椐集。
+* test : 测试数椐集。
+* imdb.vocab: 字典文件。
+* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
+* README: 数椐说明文档。
+
+测试集和训练集目录包含下面的文件:
+
+```
+labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
+```
+
+* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* unsup: 未标记的评价样本，包含50,000个txt文件。
+* urls_xx.txt: 每个评论的网址。
+* xxBow.feat: 用于统计词频的Bow模型特征。
+
+### IMDB 数椐准备
+
+在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
+
+```
+cd demo/sentiment/
+./preprocess.sh
+```
+preprocess.sh:
+
+```
+data_dir="./data/imdb"
+python preprocess.py -i data_dir
+```
+
+* data_dir: 输入数椐所在目录。
+* preprocess.py: 预处理脚本。
+
+运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
+
+```
+dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
+```
+* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
+* train.list and test.list: 训练集和测试集文件列表。
+* dict.txt: 利用训练集生成的字典。
+* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
+
+### 用户自定义数椐预处理
+
+如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
+
+```
+dataset
+|----train
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+|----test
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+```
+* dataset: 一级目录。
+* train, test: 二级目录。
+* class1,class2,...: 三级目录。
+* text_files: 文本格式的实例文件。
+
+所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
+
+## 训练模型
+
+在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
+
+<center>![LSTM](src/lstm.png)</center>
+<center>图表 1. LSTM [3]</center>
+
+情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
+
+在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
+
+#### 双向LSTM
+
+图2是双向LSTM网络，后面连全连接层和softmax层。
+
+<center>![BiLSTM](src/bi_lstm.jpg)</center>
+<center>图 2. Bidirectional-LSTM </center>
+
+#### Stacked-LSTM
+图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
+
+<center>![StackedLSTM](src/stacked_lstm.jpg)</center>
+<center>图 3. Stacked-LSTM for sentiment analysis </center>
+
+**配置**
+
+进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
+
+trainer_config.py:
+
+```python
+from sentiment_net import *
+
+data_dir  = "./data/pre-imdb"
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
+
+################## Algorithm Config #####################
+
+settings(
+  batch_size=128,
+  learning_rate=2e-3,
+  learning_method=AdamOptimizer(),
+  regularization=L2Regularization(8e-4),
+  gradient_clipping_threshold=25
+)
+
+#################### Network Config ######################
+stacked_lstm_net(dict_dim, class_dim=class_dim,
+                 stacked_num=3, is_predict=is_predict)
+#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
+```
+
+* **数椐定义**:
+   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
+   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
+
+* **算法配置**:
+   * 使用随机梯度下降（sgd）算法。
+   * 使用 adam 优化。
+   * 设置batch size大小为128。
+   * 设置平均sgd窗口。
+   * 设置全局学习率。
+* **网络配置**:
+   * dict_dim: 获取字典维度。
+   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
+   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
+   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
+
+**训练**
+
+首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
+
+```
+cd demo/sentiment/
+./train.sh
+```
+
+train.sh:
+
+```
+config=trainer_config.py
+output=./model_output
+paddle train --config=$config \
+             --save_dir=$output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=20 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
+```
+
+* \--config=$config: 设置网络配置。
+* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
+* \--job=train: 设置工作模式为训练。
+* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
+* \--trainer\_count=4:设置线程数（或GPU个数）。
+* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
+* \--log\_period=20: 每20个batch打印一次日志。
+* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
+* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
+
+如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
+
+```
+Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
+...
+Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
+Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
+```
+- Batch=xx: 表示训练了xx个Batch。
+- samples=xx: 表示训练了xx个样本。。
+- AvgCost=xx: 从第0个batch到当前batch的平均损失。
+- CurrentCost=xx: 最新log_period个batch处理的当前损失。
+- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
+- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
+- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
+
+默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
+
+## 测试模型
+
+测试模型是指使用训练出的模型评估已标记的验证集。
+
+```
+cd demo/sentiment
+./test.sh
+```
+
+test.sh:
+
+```bash
+function get_best_pass() {
+  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
+  sort | head -n 1
+}
+
+log=train.log
+LOG=`get_best_pass $log`
+LOG=(${LOG})
+evaluate_pass="model_output/pass-${LOG[1]}"
+
+echo 'evaluating from pass '$evaluate_pass
+
+model_list=./model.list
+touch $model_list | echo $evaluate_pass > $model_list
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
+```
+
+函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
+
+```
+Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
+```
+
+## 预测
+
+`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
+
+```
+cd demo/sentiment
+./predict.sh
+```
+predict.sh:
+
+```
+#Note the default model is pass-00002, you shold make sure the model path
+#exists or change the mode path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
+```
+
+* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。
+* `predict.py` : 预测接口脚本。
+* `--tconf=$config` : 设置网络配置。
+* `--model=$model` : 设置模型路径。
+* `--label=$label` : 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
+* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。
+* `--batch_size=1` : 设置batch size。
+
+注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
+
+本示例的预测结果：
+
+```
+Loading parameters from model_output/pass-00002/
+./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
+```
+我们真诚地感谢您的关注，并欢迎您来参与贡献。
+
+## 参考文档
+[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
+[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
+[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
+[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
+[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/tutorials/sentiment_analysis/index_en.md
similarity index 96%
rename from doc/demo/sentiment_analysis/sentiment_analysis.md
rename to doc/tutorials/sentiment_analysis/index_en.md
index c53952c544de9fa88a6318432e34b0d05b149445..bb7681db44ca6f286ad6935ddfecb9becb429192 100644
--- a/doc/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/tutorials/sentiment_analysis/index_en.md
@@ -293,20 +293,21 @@ predict.sh:
 model=model_output/pass-00002/
 config=trainer_config.py
 label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d data/pre-imdb/dict.txt \
-     -i data/aclImdb/test/pos/10007_10.txt
-```
-
-* `predict.py`: predicting interface.
-*  -n $config : set network configure.
-*  -w $model: set model path.
-*  -b $label: set dictionary about corresponding relation between integer label and string label.
-*  -d data/pre-imdb/dict.txt: set dictionary.
-*  -i data/aclImdb/test/pos/10014_7.txt: set one example file to predict.
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
+```
+
+* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample.
+* `predict.py` : predicting interface.
+* `--tconf=$config` : set network configure.
+* ` --model=$model` : set model path.
+* `--label=$label` : set dictionary about corresponding relation between integer label and string label.
+* `--dict=data/pre-imdb/dict.txt` : set dictionary.
+* `--batch_size=1` : set batch size.
 
 Note you should make sure the default model path `model_output/pass-00002`
 exists or change the model path.
diff --git a/doc/demo/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png
similarity index 100%
rename from doc/demo/sentiment_analysis/lstm.png
rename to doc/tutorials/sentiment_analysis/lstm.png
diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg differ
diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..aaf1fc690da2ffb8418cde5ed81848ddb5263030
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/lstm.png differ
diff --git a/doc/demo/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
similarity index 100%
rename from doc/demo/sentiment_analysis/stacked_lstm.jpg
rename to doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
diff --git a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4239055050966e0095e188a8c81d860711bce29d
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg differ
diff --git a/doc/demo/text_generation/encoder-decoder-attention-model.png b/doc/tutorials/text_generation/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/demo/text_generation/encoder-decoder-attention-model.png
rename to doc/tutorials/text_generation/encoder-decoder-attention-model.png
diff --git a/doc/demo/text_generation/text_generation.md b/doc/tutorials/text_generation/index_en.md
similarity index 100%
rename from doc/demo/text_generation/text_generation.md
rename to doc/tutorials/text_generation/index_en.md
diff --git a/doc/ui/api/trainer_config_helpers/attrs.rst b/doc/ui/api/trainer_config_helpers/attrs.rst
deleted file mode 100644
index 44919aba90df0b9da7c311a62339052c16c44ad1..0000000000000000000000000000000000000000
--- a/doc/ui/api/trainer_config_helpers/attrs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Parameter and Extra Layer Attribute
-===================================
-
-..  automodule:: paddle.trainer_config_helpers.attrs
-    :members:
diff --git a/doc/ui/api/trainer_config_helpers/index.rst b/doc/ui/api/trainer_config_helpers/index.rst
deleted file mode 100644
index 8395eb75710b3e67ec0c5442f79c999bdacdff42..0000000000000000000000000000000000000000
--- a/doc/ui/api/trainer_config_helpers/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Model Config Interface
-======================
-
-.. toctree::
-  :maxdepth: 1
-
-  optimizers.rst
-  data_sources.rst
-  layers.rst
-  activations.rst 
-  poolings.rst
-  networks.rst
-  evaluators.rst
-  attrs.rst
diff --git a/doc/ui/index.md b/doc/ui/index.md
deleted file mode 100644
index 9c1ba27bdc14fa9ab762ffb97424a8a5946808f9..0000000000000000000000000000000000000000
--- a/doc/ui/index.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# User Interface
-
-## Data Provider
-
-* [Introduction](data_provider/index.rst)
-* [PyDataProvider2](data_provider/pydataprovider2.rst)
-
-## API Reference
-
-* [Model Config Interface](api/trainer_config_helpers/index.md)
-
-## Command Line Argument
-
-* [Use Case](cmd_argument/use_case.md)
-* [Argument Outline](cmd_argument/argument_outline.md)
-* [Detailed Descriptions](cmd_argument/detail_introduction.md)
-
-## Predict
-
-* [Python Prediction API](predict/swig_py_paddle_en.rst)
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
deleted file mode 100644
index d4deb3ca5a4523b509ea5082f32be8a315570dea..0000000000000000000000000000000000000000
--- a/doc/user_guide.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-User Guide
-==========
-
-..  toctree::
-  :maxdepth: 1
-
-  demo/quick_start/index_en.md
-  build/index.rst
-  build/contribute_to_paddle.md
-  ui/index.md
-  ui/api/trainer_config_helpers/index.rst
-  demo/index.md
-  cluster/index.md
diff --git a/doc_cn/CMakeLists.txt b/doc_cn/CMakeLists.txt
deleted file mode 100644
index 314b34525ca1d328f4e3b9814ee26deed39d89fd..0000000000000000000000000000000000000000
--- a/doc_cn/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-if(NOT DEFINED SPHINX_THEME)
-    set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_docs_cn
-                  html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md
deleted file mode 100644
index c184a34e85a571e98e88c14ef653356fdd555a19..0000000000000000000000000000000000000000
--- a/doc_cn/algorithm/rnn/hierarchical-rnn.md
+++ /dev/null
@@ -1,403 +0,0 @@
-# 双层RNN配置与示例
-
-我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中，通过多组语义相同的单双层RNN配置，讲解如何使用双层RNN。
-
-## 示例1：双进双出，subseq间无memory
-
-配置：单层RNN（`sequence_layer_group`）和双层RNN（`sequence_nest_layer_group`），语义完全相同。
-
-### 读取双层序列的方法
-
-首先，我们看一下单双层序列的不同数据组织形式（您也可以采用别的组织形式）：
-
-- 单层序列的数据（`Sequence/tour_train_wdseg`）如下，一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。
-
-```text
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
-```
-
-- 双层序列的数据（`Sequence/tour_train_wdseg.nest`）如下，一共有4个样本。样本间用空行分开，代表不同的双层序列，序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。
-
-```text
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
-```
-
-其次，我们看一下单双层序列的不同dataprovider（见`sequenceGen.py`）：
-
-- 单层序列的dataprovider如下：
-  - word_slot是integer_value_sequence类型，代表单层序列。
-  - label是integer_value类型，代表一个向量。
-
-```python
-def hook(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [integer_value_sequence(len(settings.word_dict)), 
-                            integer_value(3)]
-
-@provider(init_hook=hook)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            label, comment = line.strip().split('\t')
-            label = int(''.join(label.split()))
-            words = comment.split()
-            word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
-            yield word_slot, label
-```
-
-- 双层序列的dataprovider如下：
-  - word_slot是integer_value_sub_sequence类型，代表双层序列。
-  - label是integer_value_sequence类型，代表单层序列，即一个子句一个label。注意：也可以为integer_value类型，代表一个向量，即一个句子一个label。通常根据任务需求进行不同设置。
-  - 关于dataprovider中input_types的详细用法，参见PyDataProvider2。
-
-```python
-def hook2(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
-                            integer_value_sequence(3)]
-
-@provider(init_hook=hook2)
-def process2(settings, file_name):
-    with open(file_name) as fdata:
-        label_list = []
-        word_slot_list = []
-        for line in fdata:
-            if (len(line)) > 1:
-                label,comment = line.strip().split('\t')
-                label = int(''.join(label.split()))
-                words = comment.split()
-                word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
-                label_list.append(label)
-                word_slot_list.append(word_slot)
-            else:
-                yield word_slot_list, label_list
-                label_list = []
-                word_slot_list = []
-```
-
-### 模型中的配置
-
-首先，我们看一下单层序列的配置（见`sequence_layer_group.conf`）。注意：batchsize=5表示一次过5句单层序列，因此2个batch就可以完成1个pass。
-
-```python
-settings(batch_size=5)
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# (lstm_input + lstm) is equal to lstmemory 
-with mixed_layer(size=hidden_dim*4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory_group(input=lstm_input,
-                       size=hidden_dim,
-                       act=TanhActivation(),
-                       gate_act=SigmoidActivation(),
-                       state_act=TanhActivation(),
-                       lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(size=label_dim, 
-                 act=SoftmaxActivation(), 
-                 bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
-
-```
-其次，我们看一下语义相同的双层序列配置（见`sequence_nest_layer_group.conf`），并对其详细分析：
-
-- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知，2句双层序列和5句单层序列的数据完全一样。
-- data_layer和embedding_layer不关心数据是否是序列格式，因此两个配置在这两层上的输出是一样的。
-- lstmemory:
-  - 单层序列过了一个mixed_layer和lstmemory_group。
-  - 双层序列在同样的mixed_layer和lstmemory_group外，直接加了一层group。由于这个外层group里面没有memory，表示subseq间不存在联系，即起到的作用仅仅是把双层seq拆成单层，因此双层序列过完lstmemory的输出和单层的一样。
-- last_seq：
-  - 单层序列直接取了最后一个元素
-  - 双层序列首先（last_seq层）取了每个subseq的最后一个元素，将其拼接成一个新的单层序列；接着（expand_layer层）将其扩展成一个新的双层序列，其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量；最后（average_layer层）取了每个subseq的平均值。
-  - 分析得出：第一个last_seq后，每个subseq的最后一个元素就等于单层序列的最后一个元素，而expand_layer和average_layer后，依然保持每个subseq最后一个元素的值不变（这两层仅是为了展示它们的用法，实际中并不需要）。因此单双层序列的输出是一样旳。
-
-```python
-settings(batch_size=2)
-
-data = data_layer(name="word", size=dict_dim)
-
-emb_group = embedding_layer(input=data, size=word_dim)
-
-# (lstm_input + lstm) is equal to lstmemory 
-def lstm_group(lstm_group_input):
-    with mixed_layer(size=hidden_dim*4) as group_input:
-      group_input += full_matrix_projection(input=lstm_group_input)
-
-    lstm_output = lstmemory_group(input=group_input,
-                                  name="lstm_group",
-                                  size=hidden_dim,
-                                  act=TanhActivation(),
-                                  gate_act=SigmoidActivation(),
-                                  state_act=TanhActivation(),
-                                  lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
-    return lstm_output
-
-lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
-                                  step=lstm_group,
-                                  name="lstm_nest_group")
-# hasSubseq ->(seqlastins) seq
-lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
-
-# seq ->(expand) hasSubseq
-lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE)
-
-# hasSubseq ->(average) seq
-lstm_average = pooling_layer(input=lstm_expand,
-                             pooling_type=AvgPooling(),
-                             agg_level=AggregateLevel.EACH_SEQUENCE)
-
-with mixed_layer(size=label_dim, 
-                 act=SoftmaxActivation(), 
-                 bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_average)
-
-outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
-```
-## 示例2：双进双出，subseq间有memory
-
-配置：单层RNN（`sequence_rnn.conf`），双层RNN（`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`），语义完全相同。
-
-### 读取双层序列的方法
-
-我们看一下单双层序列的不同数据组织形式和dataprovider（见`rnn_data_provider.py`）
-```python
-data = [
-    [[[1, 3, 2], [4, 5, 2]], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], 1],
-]
-
-@provider(input_types=[integer_value_sub_sequence(10),
-                       integer_value(3)])
-def process_subseq(settings, file_name):
-    for d in data:
-        yield d
-
-@provider(input_types=[integer_value_sequence(10),
-                       integer_value(3)])
-def process_seq(settings, file_name):
-    for d in data:
-        seq = []
-```
-- 单层序列：有两句，分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。
-- 双层序列：有两句，分别为[[1,3,2],[4,5,2]]（2个子句）和[[0,2],[2,5],[0,1,2]]（3个子句）。
-- 单双层序列的label都分别是0和1
-
-### 模型中的配置
-
-我们选取单双层序列配置中的不同部分，来对比分析两者语义相同的原因。
-
-- 单层序列：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
-
-```python
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    return fc_layer(input=[y, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-
-out = recurrent_group(step=step, input=emb)
-```
-- 双层序列，外层memory是一个元素：
-  - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
-  - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。
-
-```python
-def outer_step(x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y):
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        return fc_layer(input=[y, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        input=x)
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    return inner_rnn_output
-
-out = recurrent_group(step=outer_step, input=SubsequenceInput(emb))
-```
-- 双层序列，外层memory是单层序列：
-  - 由于外层每个时间步返回的是一个子句，这些子句的长度往往不等长。因此当外层有is_seq=True的memory时，内层是**无法直接使用**它的，即内层memory的boot_layer不能链接外层的这个memory。
-  - 如果内层memory想**间接使用**这个外层memory，只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下，外层memory必须有boot_layer，否则在第0个时间步时，由于外层memory没有任何seq信息，因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。
-
-## 示例3：双进双出，输入不等长
-
-**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input，用<font color="red">targetInlink</font>表示。参考配置：单层RNN（`sequence_rnn_multi_unequalength_inputs.conf`），双层RNN（`sequence_nest_rnn_multi_unequalength_inputs.conf`）
-
-### 读取双层序列的方法
-
-我们看一下单双层序列的数据组织形式和dataprovider（见`rnn_data_provider.py`）
-```python
-data2 = [
-    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
-    [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
-]
-
-@provider(input_types=[integer_value_sub_sequence(10),
-                       integer_value_sub_sequence(10),
-                       integer_value(2)],
-          should_shuffle=False)
-def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider
-    for d in data2:
-        yield d
-
-
-@provider(input_types=[integer_value_sequence(10),
-                       integer_value_sequence(10),
-                       integer_value(2)],
-          should_shuffle=False)
-def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider
-    for d in data2:
-        words1=reduce(lambda x,y: x+y, d[0])
-        words2=reduce(lambda x,y: x+y, d[1])
-        yield words1, words2, d[2]
-```
-
-data2 中有两个样本，每个样本有两个特征, 记fea1, fea2。
-
-- 单层序列：两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]]
-- 双层序列：两个样本分别为
-  - **样本1**：[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句，fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]]
-  - **样本2**：[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句， fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。<br/>
-  - **注意**：每个样本中，各特征的子句数目需要相等。这里说的“双进双出，输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本，时刻i=2, fea1[2]=[4, 5, 2]，fea2[2]=[3, 1]，3≠2。
-- 单双层序列中，两个样本的label都分别是0和1
-
-### 模型中的配置
-
-单层RNN（`sequence_rnn_multi_unequalength_inputs.conf`）和双层RNN（`sequence_nest_rnn_multi_unequalength_inputs.conf`）两个模型配置达到的效果完全一样，区别只在于输入为单层还是双层序列，现在我们来看它们内部分别是如何实现的。
-
-- 单层序列：
-  - 过了一个简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全连接，功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里，两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列，最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
-  - 注意到这里recurrent_group输入的每个样本中，fea1和fea2的长度都分别相等，这并非偶然，而是因为recurrent_group要求输入为单层序列时，所有输入的长度都必须相等。
-
-```python
-def step(x1, x2):
-	def calrnn(y):
-		mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim)
-        out = fc_layer(input = [y, mem],
-	        size = hidden_dim,
-	        act = TanhActivation(),
-            bias_attr = True,
-            name = 'rnn_state_' + y.name)
-        return out
-
-	encoder1 = calrnn(x1)
-    encoder2 = calrnn(x2)
-    return [encoder1, encoder2]
-    
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="stepout",                           
-    step=step,
-    input=[emb1, emb2])
-
-encoder1_last = last_seq(input = encoder1_rep)                           
-encoder1_expandlast = expand_layer(input = encoder1_last,
-                                   expand_as = encoder2_rep)
-context = mixed_layer(input = [identity_projection(encoder1_expandlast),
-                               identity_projection(encoder2_rep)],
-                      size = hidden_dim)
-```
-- 双层序列：
-  - 双层RNN中，对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2)，其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是，此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。
-  - 函数`outer_step`中可以分别处理这两个特征，但我们需要用<font color=red>targetInlink</font>指定recurrent_group的输出的格式（各子句长度）只能和其中一个保持一致，如这里选择了和emb2的长度一致。
-  - 最后，依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
-
-```python
-def outer_step(x1, x2):
-    outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim)
-    outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim)
-    def inner_step1(y):
-        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
-                           size = hidden_dim,
-                           boot_layer = outer_mem1)
-        out = fc_layer(input = [y, inner_mem],
-                       size = hidden_dim,
-                       act = TanhActivation(),
-                       bias_attr = True,
-                       name = 'inner_rnn_state_' + y.name)
-        return out
-
-    def inner_step2(y):
-        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
-                           size = hidden_dim,
-                           boot_layer = outer_mem2)
-        out = fc_layer(input = [y, inner_mem],
-                       size = hidden_dim,
-                       act = TanhActivation(),
-                       bias_attr = True,
-                       name = 'inner_rnn_state_' + y.name)
-        return out
-
-    encoder1 = recurrent_group(
-        step = inner_step1,
-        name = 'inner1',
-        input = x1)
-
-    encoder2 = recurrent_group(
-        step = inner_step2,
-        name = 'inner2',
-        input = x2)
-
-    sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1')
-    sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2')
-
-    encoder1_expand = expand_layer(input = sentence_last_state1,
-                                   expand_as = encoder2)
-
-    return [encoder1_expand, encoder2]
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
-    targetInlink=emb2)
-
-encoder1_last = last_seq(input = encoder1_rep)
-encoder1_expandlast = expand_layer(input = encoder1_last,
-                                   expand_as = encoder2_rep)
-context = mixed_layer(input = [identity_projection(encoder1_expandlast),
-                               identity_projection(encoder2_rep)],
-                      size = hidden_dim)
-```
-
-## 示例4：beam_search的生成
-
-TBD
diff --git a/doc_cn/build/docker/build_docker_image.rst b/doc_cn/build/docker/build_docker_image.rst
deleted file mode 100644
index 73409ceaff4e1a1f8ac3ad0d828e003a214c8fcb..0000000000000000000000000000000000000000
--- a/doc_cn/build/docker/build_docker_image.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-构建PaddlePaddle Docker Image
-===========================
-
-PaddlePaddle的Docker Image构建源码放置在 :code:`${源码根目录}/paddle/scripts/docker/`目录下。
-该Image基于ubuntu 14.04。该目录下有两个文件，Dockerfile和build.sh。其中:
-
-*  Dockerfile是docker image的主要描述文件。描述了Docker image的构建步骤、各种参数和维护
-   人员等等。
-*  build.sh是docker image的主要构建步骤。
-
-该image的构建在docker 1.12版本测试通过, 低于docker 1.12版本的情况下并没有测试。主要由于旧版本
-的docker可能缺乏 :code:`--build-arg` 参数，从而不能在运行编译命令的时候接受参数。
-
-同时，该构建脚本充分考虑了网络不稳定的情况，对于cuda的Toolkit有断点续传和传输速度过小重启下载的
-简单优化。
-
-使用脚本构建PaddlePaddle Docker Image
--------------------------------------------
-
-该脚本的使用方法是，进入该源码目录，执行 :code:`docker build .` 命令。可以使用
- :code:`--build-arg` 传入的配置参数包括:
-
-*  LOWEST\_DL\_SPEED\: 多线程下载过程中，最低线程的下载速度(默认单位是Bytes，可以传入10K, 
-   10M，或者10G这样的单位)。如果小于这个下载速度，那么这个下载线程将会关闭。所有的下载线程关闭时，
-   下载进程会重启。
-*  WITH\_GPU\: ON or OFF。是否开启GPU功能。注意，编译PaddlePaddle的GPU版本并不需要一定在具有GPU
-   的机器上进行。但是，运行PaddlePaddle的GPU版本一定要在具有CUDA的机器上运行。
-
-简单的使用样例为\:
-
-..  code-block:: bash
-
-    cd ${源码根目录}/paddle/scripts/docker/
-    docker build --build-arg LOWEST_DL_SPEED=50K\
-                 --build-arg WITH_GPU=ON \
-                 --tag  paddle_gpu:latest .
-
-即可在本地编译出PaddlePaddle的镜像。
diff --git a/doc_cn/build_and_install/cmake/index.rst b/doc_cn/build_and_install/cmake/index.rst
deleted file mode 100644
index e2a12c500177ea5b075416380796ab82e1217f60..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-使用cmake编译PaddlePaddle
-=========================
-
-..  toctree::
-    
-    install_deps.rst
-    compile_options.rst
-    make_and_install.rst
diff --git a/doc_cn/build_and_install/cmake/install_deps.rst b/doc_cn/build_and_install/cmake/install_deps.rst
deleted file mode 100644
index 7fa4665a954bd41e74145c4a1b00734c3ac41d83..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/install_deps.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-安装编译PaddlePaddle需要的依赖
-==============================
-
-参见 `安装编译依赖 <../../../doc/build/build_from_source.html#install-dependencies>`_
diff --git a/doc_cn/build_and_install/cmake/make_and_install.rst b/doc_cn/build_and_install/cmake/make_and_install.rst
deleted file mode 100644
index 212b9c9352b01db5215221a6c2faafe0d679d962..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/make_and_install.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-make和make install
-==================
-
-参见 `make和make install <../../../doc/build/build_from_source.html#build-and-install>`_
diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst
deleted file mode 100644
index 2205e282248c4e7f6d1173be47aadf160554c6be..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-编译与安装
-========================
-
-安装
-++++
-
-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
-
-.. toctree::
-   :maxdepth: 1
-   :glob:
-   
-   使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst>
-   install/docker_install.rst 
-   install/ubuntu_install.rst
-
-
-
-编译
-++++
-
-..  warning::
-
-	编译选项主要推荐高级用户查看，普通用户请走安装流程。
-
-.. toctree::
-   :maxdepth: 1
-   :glob:
-
-   源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst>
-   从源码编译安装(对内)  <../build/internal/build_from_source_zh_cn.rst>
-   cmake/index.rst
diff --git a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile b/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
deleted file mode 100644
index 7cb947bddf4593259cb69f525b44015836291605..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
-
-RUN apt-get update
-RUN apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-
-EXPOSE 22
-
-CMD    ["/usr/sbin/sshd", "-D"]
diff --git a/doc_cn/build_and_install/install/paddle_version.txt b/doc_cn/build_and_install/install/paddle_version.txt
deleted file mode 100644
index a80873303fd0d05d963482629000d76260185ef6..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/install/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b1, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning:
-    with_timer: OFF
-    with_predict_sdk:
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst
deleted file mode 100644
index 70ac5225bd82e40838875b49f67e70ff08eff853..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-使用deb包在Ubuntu上安装PaddlePaddle
-===================================
-
-PaddlePaddle目前支持使用deb包安装。Paddle的 :code:`deb` 安装包在ubuntu 14.04中正确，但理论上支持其他的 debian 发行版。
-
-
-PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noavx、gpu-noavx 四个版本。其中 noavx 用于不支持AVX指令集的cpu。安装包的下载地址是\: https://github.com/baidu/Paddle/releases/
-
-
-用户需要先将PaddlePaddle安装包下载到本地，然后执行如下 :code:`gdebi` 命令即可完成安装。
-
-..  code-block:: shell
-
-    gdebi paddle-*-cpu.deb
-
-如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
-
-
-或者使用下面一条命令安装.
-
-..  code-block:: shell
-
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-
-在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
-在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
-
-需要注意的是，如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，
-并设置好对应的环境变量(LD_LIBRARY_PATH等等)。
-
-安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本。可能的输出为
-
-..  literalinclude:: paddle_version.txt
-
-可能遇到的问题
---------------
-
-libcudart.so/libcudnn.so找不到
-++++++++++++++++++++++++++++++
-
-安装完成PaddlePaddle后，运行 :code:`paddle train` 报错\:
-
-..	code-block:: shell
-
-	0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-PaddlePaddle使用运行时动态连接CUDA的so，如果在 LD_LIBRARY_PATH里面找不到这些动态
-库的话，会报寻找不到这些动态库。
-
-解决方法很简单，就是将这些动态库加到环境变量里面。比较可能的命令如下。
-
-..	code-block:: text
-
-	export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-
-CUDA Driver找不到
-+++++++++++++++++
-
-运行 :code:`paddle train` 报错\:
-
-..	code-block:: text
-
-	F0831 12:39:16.699000  1090 hl_cuda_device.cc:530] Check failed: cudaSuccess == cudaStat (0 vs. 35) Cuda Error: CUDA driver version is insufficient for CUDA runtime version
-
-PaddlePaddle运行时如果没有寻找到cuda的driver，变会报这个错误。解决办法是将cuda 
-driver添加到LD_LIBRARY_PATH中。比较可能的命令如下。
-
-..	code-block:: text
-
-	export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
-
-config文件找不到
-++++++++++++++++
-
-运行 :code:`paddle train` 得到结果\:
-
-..	code-block:: text
-
-	F0831 20:53:07.525789  1302 TrainerMain.cpp:94] Check failed: config != nullptr no valid config
-
-PaddlePaddle在运行时找不到对应的config文件，说明命令行参数 :code:`config` 没有设置。
-而这个一般说明PaddlePaddle已经安装完毕了。
\ No newline at end of file
diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst
deleted file mode 100644
index 25313a9635bbf567a1aedfac3c379802d601d283..0000000000000000000000000000000000000000
--- a/doc_cn/cluster/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-集群训练
-========
-
-* `集群训练 <../../doc/cluster/index.html>`_
-
-.. toctree::
-    :maxdepth: 2
-    :glob:
-
-    集群训练(对内) <internal/index.md>
-
diff --git a/doc_cn/concepts/nn.rst b/doc_cn/concepts/nn.rst
deleted file mode 100644
index f4d2cf490d14761f4b9f6a308180c5e8015cbecb..0000000000000000000000000000000000000000
--- a/doc_cn/concepts/nn.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-TBD
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/concepts/program_concepts.rst b/doc_cn/concepts/program_concepts.rst
deleted file mode 100644
index af5bbdac260afce0a032461ab913d05bc2f55929..0000000000000000000000000000000000000000
--- a/doc_cn/concepts/program_concepts.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-TBD
-###
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/concepts/use_concepts.rst b/doc_cn/concepts/use_concepts.rst
deleted file mode 100644
index 67e98edabc0c2a4ecdf8d7993f8dd66b9365a05d..0000000000000000000000000000000000000000
--- a/doc_cn/concepts/use_concepts.rst
+++ /dev/null
@@ -1,191 +0,0 @@
-#########################
-PaddlePaddle 基本使用概念
-#########################
-
-PaddlePaddle是一个神经网络学习框架。其单机进程为 :code:`paddle train`。 单机的所有设备使用，均在单机进程内调度完成。 而多机辅助进程 :code:`paddle pserver` 负责联合多个单机进程进行通信，进而充分利用集群的计算资源。 PaddlePaddle同时以 :code:`swig api` 的形式，提供训练结果模型预测的方法和自定义训练流程。
-
-下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是，读者已经了解 `基本的神经网络/机器学习原理和概念 <nn.html>`_ 。同时，如果想要了解PaddlePaddle实现中的一些概念，请参考 `PaddlePaddle 编程中的基本概念 <program_concepts.html>`_ 。
-
-..	contents::
-
-PaddlePaddle 的进程模型
-=======================
-
-PaddlePaddle进程内嵌了一个 :code:`python` 解释器。 这个 :code:`python` 解释器负责解析用户定义的神经网络配置，和解析用户数据，并将用户数据传入给 PaddlePaddle。
-
-..	graphviz:: 
-
-	digraph pp_process {
-		rankdir=LR;
-		config_file [label="用户神经网络配置"];
-		subgraph cluster_pp {
-			style=filled;
-			color=lightgrey;
-			node [style=filled, color=white, shape=box];
-			label = "PaddlePaddle C++";
-			py [label="Python解释器"];
-		}
-		data_provider [label="用户数据解析"];
-		config_file -> py;
-		py -> data_provider [dir="back"];
-	}
-
-所以，PaddlePaddle单机训练进程，:code:`paddle train` , 对于用户的主要接口语言为 python。 主要需要用户配置的两个文件为 :code:`DataProvider` 和训练文件 :code:`TrainerConfig` 。
-
-
-DataProvider
-============
-
-DataProvider是 :code:`paddle train` 的数据提供器。 它负责将用户的原始数据转换成 PaddlePaddle 可以识别的数据类型。每当 PaddlePaddle 需要新的数据训练时，都会调用 DataProvider 返回数据。 当所有数据读取完一轮后，DataProvider 便返回空数据通知 PaddlePaddle。PaddlePaddle负责在下一轮训练开始前，将DataProvider重置。
-
-需要注意的是，DataProvider在PaddlePaddle中是被训练逻辑调用的关系， 而不是新的数据驱动训练。并且所有的 :code:`shuffle` , 和一些随机化的噪声添加，都应该在 DataProvider 阶段完成。
-
-为了方便用户使用自己的数据格式， PaddlePaddle 提供了 `PyDataProvider`_ 来处理数据。 并且在这个Provider中，PaddlePaddle的 C++ 部分接管了如何shuffle，处理 batch，GPU/CPU通信，双缓冲，异步读取等问题。 用户可以参考 `PyDataProvider`_ 的相关文档，继续深入了解 DataProvider 的使用。
-
-
-训练文件
-========
-
-训练文件是PaddlePaddle中配置神经网络结构、学习优化算法、数据传入方式的地方。 训练文件是一个python文件，使用命令行参数 :code:`--config` 传给 paddle 的主程序。 例如\:
-
-..	code-block:: bash
-
-	paddle train --config=trainer_config.py
-
-一个典型简单的训练文件可能为
-
-..  literalinclude:: trainer_config.py
-    :linenos:
-
-下面我们详细的介绍一下训练文件中各个模块的概念。
-
-
-trainer_config_helpers
-----------------------
-
-PaddlePaddle的配置文件与PaddlePaddle C++端通信的最基础协议是 :code:`protobuf` 。而为了避免用户直接写比较难写的 protobuf string，我们书写了一个helpers来生成这个protobuf包。所以在文件的开始，import这些helpers函数。
-
-需要注意的是，这个 :code:`paddle.trainer_config_helpers` 包是标准的python包，这意味着用户可以选择自己喜欢的 :code:`ide` 或者编辑器来编写Paddle的配置文件，这个python包注释文档比较完善，并且考虑了IDE的代码提示与类型注释。
-
-data_sources
-------------
-
-data_sources是配置神经网络的数据源。这里使用的函数是 :code:`define_py_data_sources2` ，这个函数是定义了使用 `PyDataProvider`_ 作为数据源。 而后缀 :code:`2` 是Paddle历史遗留问题，因为Paddle之前使用的 PyDataProvider 性能较差，所以完全重构了一个新的 `PyDataProvider`_ 。
-
-data_sources里面的 train_list 和 test_list 指定的是训练文件列表和测试文件列表。 如果传入一个字符串的话，是指一个训练列表文件。这个训练列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个 list 文件，再传入给 train.list 或者 test.list 。
-
-而 :code:`module` 和 :code:`obj` 指定了 DataProvider 的模块名和函数名。
-
-更具体的使用，请参考 `PyDataProvider`_ 。
-
-settings
---------
-
-`settings`_ 是神经网络训练算法相关的设置项。包括学习率，batch_size，优化算法，正则方法等等。具体的使用方法请参考 `settings`_ 文档。
-
-网络配置
---------
-
-上述网络配置中余下的部分均是神经网络配置。第一行是定义一个名字叫 "pixel" 的 :code:`data_layer` 。每一个layer返回的都是一个 :code:`LayerOutput` 对象。 这里第一层的输出对象是 :code:`img` 。然后这个对象传输给了另一个 layer 函数，
-:code:`simple_img_conv_pool` 。:code:`simple_img_conv_pool` 是一个组合层，
-包括了图像的卷积 (convolution) 和池化(pooling)，
-并继续接了一个全连接层( :code:`fc_layer` )，然后再接了一个Softmax的全连接层。
-
-最终，网络配置输出了 :code:`classification_cost` 。标记网络输出的函数为 
-:code:`outputs` 。网络的输出是神经网络的优化目标，神经网络训练的时候，实际上就是
-要最小化这个输出。
-
-在神经网络进行预测的时候，实际上网络的输出也是通过 :code:`outputs` 标记。
-
-
-Layer、Projection、Operator
-===========================
-
-PaddlePaddle的网络基本上是基于Layer来配置的。所谓的Layer即是神经网络的某一层，
-而神经网络的某一层，一般是封装了许多复杂操作的操作集合。比如最简单的
-:code:`fc_layer` ，也包括矩阵乘法，多输入的求和，和activation。
-
-..	code-block:: python
-
-	data = data_layer(name='data', size=200)
-	out = fc_layer(input=data, size=200, act=TanhActivation())
-
-而对于更灵活配置需求，可能这样基于Layer的配置是不灵活的。于是 PaddlePaddle 提供
-了基于 Projection 或者 Operator 的配置。使用Projection和Operator需要与
-:code:`mixed_layer` 配合使用。 :code:`mixed_layer` 是将layer中的元素累加求和，
-并且做一个 :code:`activation` ， 而这个layer具体如何计算，是交由内部的Projection
-和 Operator 定义。Projection是指含有可学习参数的操作，而Operator不含有可学习的
-参数，输入全是其他Layer的输出。
-
-
-例如，和 :code:`fc_layer` 同样功能的 :code:`mixed_layer` 。
-
-..	code-block:: python
-
-	data = data_layer(name='data', size=200)
-	with mixed_layer(size=200) as out:
-		out += full_matrix_projection(input=data)
-
-PaddlePaddle可以使用的mixed layer 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。
-用户可以参考 `mixed_layer`_ 的相关文档进行配置。
-
-如何利用单机的所有GPU或所有CPU核心
-==================================
-
-PaddlePaddle的单机进程 :code:`paddle train` 可以充分利用一台计算机上所有的GPU资
-源或者CPU。
-
-如果要使用机器上多块GPU，使用如下命令即可\:
-
-..	code-block:: bash
-
-	paddle train --use_gpu=true --trainer_count=4  # use 4 gpu card, 0, 1, 2, 3
-
-如果要使用机器上多块CPU, 使用如下命令即可\:
-
-..	code-block:: bash
-
-	paddle train --trainer_config=4  # use 4 cpu cores.
-
-对于其他设置GPU的选择情况，例如选择第0、2号GPU显卡，则可以使用 :code:`CUDA_VISIBLE_DEVICES` 环境变量来选择部分的显卡。 具体可以参考连接`masking-gpus`_ 。 可以使用的命令为
-
-..	code-block:: bash
-
-	env CUDA_VISIBLE_DEVICES=0,2 paddle train --use_gpu=true --trainer_config=2
-
-如何利用多台机器的计算资源训练神经网络
-======================================
-
-PaddlePaddle多机使用的经典方法是通过 :code:`Parameter Server` 来对多机的 :code:`paddle train` 进行同步。 而多机训练神经网络，首先要讲数据切分到不同的机器上。 切分数据文件的方式在PaddlePaddle的开源实现中并没有提供工具包。 但是切分数据并不是一件非常复杂的事情，也不是神经网络实现的重点。
-
-多机训练过程中，经典的拓扑结构如下\:
-
-..	graphviz:: pserver_topology.dot
-
-图中每个灰色方块是一台机器，在每个机器中，先去启动一个 :code:`paddle pserver` 进程，并确定整体的端口号。可能的参数是\:
-
-..	code-block:: bash
-
-	paddle pserver --port=5000 --num_gradient_servers=4 --nics='eth0'
-
-这里说明系统的 :code:`paddle pserver` 的起始端口是 :code:`5000` ，并且有四个训练进程(:code:`gradient_servers`，Paddle同时将 :code:`paddle train` 进程称作 :code:`GradientServer` 。因为其为负责提供Gradient的进程)。 而对于训练进程的话，则需要在 :code:`paddle pserver` 启动之后，再在各个节点上运行如下命令\:
-
-..	code-block:: bash
-
-	paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
-
-对于简单的多机协同使用上述方式即可。同时，pserver/train 通常在高级情况下，还有两个参数需要设置，他们是
-
-* --ports_num\: 一个 pserver进程共绑定多少个端口用来做稠密更新。默认是1
-* --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新，默认是0
-
-使用手工指定端口数量，是因为Paddle的网络通信中，使用了 :code:`int32` 作为消息长度，比较容易在大模型下溢出。所以，在 :code:`paddle pserver` 进程中可以启动多个子线程去接受 trainer 的数据，这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大，因为增加这个值，还是对性能，尤其是内存占用有一定的开销的，另外稀疏更新的端口如果太大的话，很容易某一个参数服务器没有分配到任何参数。
-
-详细的说明可以参考，使用 `集群训练Paddle`_ 。
-
-
-..  _PyDataProvider: ../ui/data_provider/pydataprovider2.html
-..	_settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
-..	_mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
-..	_masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus
-..  _集群训练Paddle: ../cluster/index.html
diff --git a/doc_cn/demo/index.rst b/doc_cn/demo/index.rst
deleted file mode 100644
index e15e839f93d4ac0d455e49fd8b1cde8bf60a29ac..0000000000000000000000000000000000000000
--- a/doc_cn/demo/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-使用示例
-========
-
-图像
-''''
-
-* `图像分类 <../../doc/demo/image_classification/index.html>`_
-
-自然语言处理
-''''''''''''
-
-* `情感分析 <sentiment_analysis/index.html>`_
-* `文本生成 <../../doc/demo/text_generation/index.html>`_
-* `词性标注 <../../doc/demo/semantic_role_labeling/index.html>`_
-
-推荐
-''''
-
-* `MovieLens数据集 <../../doc/demo/rec/ml_dataset.html>`_
-* `MovieLens评分回归 <../../doc/demo/rec/ml_regression.html>`_
-
-常用模型
-''''''''
-
-* `ImageNet: ResNet <../../doc/demo/imagenet_model/resnet_model.html>`_
-* `Embedding: Chinese Word <../../doc/demo/embedding_model/index.html>`_
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
deleted file mode 100644
index 4d9b24ba851a7aaaeb0d79bfbeb0703b8878b77f..0000000000000000000000000000000000000000
--- a/doc_cn/demo/quick_start/index.md
+++ /dev/null
@@ -1,545 +0,0 @@
-# PaddlePaddle快速入门教程
-
-我们以文本分类问题作为背景，介绍PaddlePaddle使用流程和常用的网络基础单元的配置方法。
-
-## 安装(Install)
-
-首先请参考<a href = "../../build_and_install/index.html">安装教程</a>安装PaddlePaddle。
-
-## 使用概述(Overview)
-
-**文本分类问题**：对于给定的一条文本， 我们从提前给定的类别集合中选择其所属类
-别。比如通过用户对电子商务网站评论，评估产品的质量：
-
-- 这个显示器很棒！ （好评）
-- 用了两个月之后这个显示器屏幕碎了。（差评）
-
-每一个任务流程都可以分为如下5个基础部分。
-<center> ![](./Pipeline.jpg) </center>
-
-1. 数据格式准备
-    - 每行保存一条样本，类别Id 和文本信息用Tab间隔， 文本中的单词用空格分隔（如果不切词，则字与字之间用空格分隔），例如：```类别Id ‘\t’ 这 个 显 示 器 很 棒 ！```
-2. 数据向模型传送
-    - PaddlePaddle可以读取Python写的传输数据脚本，所有字符都将转换为连续整数表示的Id传给模型
-3. 网络结构（由易到难展示4种不同的网络配置）
-    - 逻辑回归模型
-    - 词向量模型
-    - 卷积模型
-    - 时序模型
-    - 优化算法
-4. 训练模型
-5. 预测
-
-## 数据格式准备(Data Preparation)
-在本问题中，我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/)，
-将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/baidu/Paddle)的`demo/quick_start`里提供了数据下载脚本
-和预处理脚本。
-
-```bash
-cd demo/quick_start
-./data/get_data.sh
-./preprocess.sh
-```
-
-## 数据向模型传送(Transfer Data to Model)
-
-### Python数据加载脚本(Data Provider Script)
-
-下面dataprovider_bow.py文件给出了完整例子，主要包括两部分：
-
-* initalizer： 定义文本信息、类别Id的数据类型。
-* process： yield文本信息和类别Id，和initalizer里定义顺序一致。
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer need to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # setting.input_types specifies what the data types the data provider
-    # generates.
-    settings.input_types = [
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        integer_value(2)]
-
-# Delaring a data provider. It has an initializer 'data_initialzer'.
-# It will cache the generated data of the first pass in memory, so that
-# during later pass, no on-the-fly data generation will be needed.
-# `setting` is the same object used by initializer()
-# `file_name` is the name of a file listed train_list or test_list file given
-# to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield word_vector, int(label)
-```
-
-### 配置中的数据加载定义(Data Provider in Configure)
-
-在模型配置中利用`define_py_data_sources2`加载数据：
-
-```python
-from paddle.trainer_config_helpers import *
-
-file = "data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includs word Ids.
-define_py_data_sources2(train_list='data/train.list',
-                        test_list='data/test.list',
-                        module="dataprovider_bow",
-                        obj="process",
-                        args={"dictionary": word_dict})
-```
-* data/train.list,data/test.list: 指定训练、测试数据
-* module="dataprovider": 数据处理Python文件名
-* obj="process": 指定生成数据的函数
-* args={"dictionary": word_dict}: 额外的参数，这里指定词典
-
-更详细数据格式和用例请参考<a href = "../../ui/data_provider/pydataprovider2.html">
-PyDataProvider2</a>。
-
-## 网络结构(Network Architecture)
-本节我们将专注于网络结构的介绍。
-<center> ![](./PipelineNetwork.jpg) </center>
-
-我们将以基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置
-连接请参考<a href = "../../../doc/layer.html">Layer文档</a>。
-所有配置在[源码](https://github.com/baidu/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
-
-### 逻辑回归模型(Logistic Regression)
-
-流程如下：
-<center> ![](./NetLR.jpg) </center>
-
-- 获取利用one-hot vector表示的每个单词，维度是词典大小
-
-```python
-word = data_layer(name="word",  size=word_dim)
-```
-
-- 获取该条样本类别Id，维度是类别个数。
-
-```python
-label = data_layer(name="label", size=label_dim)
-```
-
-- 利用逻辑回归模型对该向量进行分类，同时会计算分类准确率
-
-```python
-# Define a fully connected layer with logistic activation (also called softmax activation).
-output = fc_layer(input=word,
-                  size=label_dim,
-                  act_type=SoftmaxActivation())
-# Define cross-entropy classification loss and error.
-classification_cost(input=output, label=label)
-```
-
- - input: 除过data层，每个层都有一个或多个input,多个input以list方式输入
- - size: 该层神经元个数
- - act_type: 激活函数类型
-
-效果总结：我们将在后面介绍训练和预测的流程的脚本。在此为方便对比不同网络结构，
-我们随时总结了各个网络的复杂度和效果。
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归</td>
-<td class="left">252 KB</td>
-<td class="left">8.652%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### 词向量模型(Word Vector)
-
-embedding模型需要稍微改变数据提供的脚本，即`dataprovider_emb.py`，词向量模型、
-卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。
-
-```
-def initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = [
-        # Define the type of the first input as sequence of integer.
-        # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        integer_value(2)]
-
-@provider(init_hook=initializer)
-def process(settings, file_name):
-    ...
-    # omitted, it is same as the data provider for LR model
-```
-
-该模型依然是使用逻辑回归分类网络的框架， 只是将句子利用连续向量表示替换稀疏
-向量表示， 即对第3步进行替换。句子表示的计算更新为2步：
-<center> ![](./NetContinuous.jpg) </center>
-
-- 利用单词Id查找对应的该单词的连续表示向量(维度为word_dim)， 输入N个单词，输出为N个word_dim维度向量
-
-```python
-emb = embedding_layer(input=word, size=word_dim)
-```
-
-- 将该句话包含的所有单词向量求平均得到句子的表示
-
-```python
-avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-```
-
-其它部分和逻辑回归网络结构一致。
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">词向量模型</td>
-<td class="left">15 MB</td>
-<td class="left">8.484%</td>
-</tr>
-
-</tbody>
-</table>
-</html></center>
-<br>
-
-### 卷积模型(Convolution)
-卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型额步
-骤3-2进行进一步演化， 变为3个新的子步骤。
-<center> ![](./NetConv.jpg) </center>
-
-文本卷积分为三个步骤：
-1. 获取每个单词左右各k个近邻， 拼接成一个新的向量表示；
-2. 对该表示进行非线性变换 （例如Sigmoid变换）, 成为维度为hidden_dim的新的向量；
-3. 在每个维度上取出在该句话新的向量集合上该维度的最大值作为最后的句子表示向量。 这3个子步骤可配置为:
-
-```python
-text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
-```
-
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">卷积模型</td>
-<td class="left">16 MB</td>
-<td class="left">5.628%</td>
-</tr>
-
-</tbody>
-</table></center>
-<br>
-
-### 时序模型(Time Sequence)
-<center> ![](./NetRNN.jpg) </center>
-
-时序模型即为RNN模型, 包括简单的RNN模型、GRU模型、LSTM模型等。
-
-- GRU模型配置：
-
-```python
-gru = simple_gru(input=emb, size=gru_size)
-```
-
-- LSTM模型配置：
-
-```python
-lstm = simple_lstm(input=emb, size=lstm_size)
-```
-
-针对本问题，我们采用单层LSTM模型，并使用了Dropout，效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">时序模型</td>
-<td class="left">16 MB</td>
-<td class="left">4.812%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-## 优化算法(Optimization Algorithm)
-<a href = "../../../doc/ui/trainer_config_helpers_api.html#module-paddle.trainer_config_helpers.optimizers">优化算法</a>包括
-Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优化方法，加了L2正则和梯度截断。
-
-```python
-settings(batch_size=128,
-         learning_rate=2e-3,
-         learning_method=AdamOptimizer(),
-         regularization=L2Regularization(8e-4),
-         gradient_clipping_threshold=25)
-```
-
-## 训练模型(Training Model)
-在完成了数据和网络结构搭建之后， 我们进入到训练部分。
-<center> ![](./PipelineTrain.jpg) </center>
-
-训练脚本：我们将训练的命令行保存在了 `train.sh`文件中。训练时所需设置的主要参数如下：
-
-```bash
-paddle train \
---config=trainer_config.py \
---log_period=20 \
---save_dir=./output \
---num_passes=15 \
---use_gpu=false
-```
-这里没有介绍多机分布式训练，可以参考<a href = "../../cluster/index.html">分布式训练</a>的demo学习如何进行多机训练。
-
-## 预测(Prediction)
-可以使用训练好的模型评估带有label的验证集，也可以预测没有label的测试集。
-<center> ![](./PipelineTest.jpg) </center>
-
-测试脚本如下，将会测试配置文件中test.list指定的数据。
-
-```bash
-paddle train \
---use_gpu=false \
---job=test \
---init_model_path=./output/pass-0000x
-```
-
-可以参考<a href = "../../ui/predict/swig_py_paddle.html">Python API预测</a>
-教程，或其他<a href = "../../demo/index.html">demo</a>的Python预测过程。也可以通过如下方式预测。
-
-预测脚本(`predict.sh`)：
-
-```bash
-model="output/pass-00003"
-paddle train \
-    --config=trainer_config.lstm.py \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. \
-
-mv rank-00000 result.txt
-```
-这里以`output/pass-00003`为例进行预测，用户可以根据训练log选择test结果最好的模型来预测。与训练网络配置不同的是：无需label相关的层，指定outputs输出概率层(softmax输出)，
-指定batch_size=1，数据传输无需label数据，预测数据指定test_list的位置。
-
-预测结果以文本的形式保存在`result.txt`中，一行为一个样本，格式如下：
-
-```
-预测ID;ID为0的概率 ID为1的概率
-预测ID;ID为0的概率 ID为1的概率
-```
-
-```
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-obj = 'process' if not is_predict else 'process_pre'
-batch_size = 128 if not is_predict else 1
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid,output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-```
-
-## 总体效果总结(Summary)
-这些流程中的数据下载、网络配置、训练脚本在`/demo/quick_start`目录，我们在此总
-结上述网络结构在Amazon-Elec测试集(25k)上的效果:
-
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-<th scope="col" class="left">配置文件</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归模型</td>
-<td class="left"> 252KB </td>
-<td class="left">8.652%</td>
-<td class="left">trainer_config.lr.py</td>
-</tr>
-
-<tr>
-<td class="left">词向量模型</td>
-<td class="left"> 15MB </td>
-<td class="left"> 8.484%</td>
-<td class="left">trainer_config.emb.py</td>
-</tr>
-
-<tr>
-<td class="left">卷积模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 5.628%</td>
-<td class="left">trainer_config.cnn.py</td>
-</tr>
-
-<tr>
-<td class="left">时序模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 4.812%</td>
-<td class="left">trainer_config.lstm.py</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
-
-## 附录(Appendix)
-### 命令行参数(Command Line Argument)
-
-* \--config：网络配置
-* \--save_dir：模型存储路径
-* \--log_period：每隔多少batch打印一次日志
-* \--num_passes：训练轮次，一个pass表示过一遍所有训练样本
-* \--config_args：命令指定的参数会传入网络配置中。
-* \--init_model_path：指定初始化模型路径，可用在测试或训练时指定初始化模型。
-
-默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
-可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考<a href = "../../ui/index.html#command-line-argument">令行参数文档</a>。
-
-### 输出日志(Log)
-
-```
-TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-```
-模型训练会看到这样的日志，详细的参数解释如下面表格：
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">名称</th>
-<th scope="col" class="left">解释</th>
-</tr>
-</thead>
-
-<tr>
-<td class="left">Batch=20</td>
-<td class="left"> 表示过了20个batch </td>
-</tr>
-
-<tr>
-<td class="left">samples=2560</td>
-<td class="left"> 表示过了2560个样本 </td>
-</tr>
-
-<tr>
-<td class="left">AvgCost</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">CurrentCost</td>
-<td class="left"> 当前log_period个batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">Eval: classification_error_evaluator</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均分类错误率 </td>
-</tr>
-
-<tr>
-<td class="left">CurrentEval: classification_error_evaluator</td>
-<td class="left"> 当前log_period个batch所有样本的平均分类错误率 </td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
diff --git a/doc_cn/demo/sentiment_analysis/index.rst b/doc_cn/demo/sentiment_analysis/index.rst
deleted file mode 100644
index 82400b2459ebcaf89ff5e884edfe721b9ec01d7f..0000000000000000000000000000000000000000
--- a/doc_cn/demo/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-情感分析教程
-===========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <sentiment_analysis.md>
\ No newline at end of file
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
deleted file mode 100644
index f1398206fddffca77f583c195e00034e55932639..0000000000000000000000000000000000000000
--- a/doc_cn/index.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-PaddlePaddle文档
-================
-
-使用指南
---------
-* `介绍 <introduction/index.html>`_
-* `快速入门 <demo/quick_start/index.html>`_
-* `基本使用概念 <concepts/use_concepts.html>`_
-* `编译与安装 <build_and_install/index.html>`_
-* `用户接口 <ui/index.html>`_
-* `使用示例 <demo/index.html>`_
-* `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_
-* `集群训练 <cluster/index.html>`_
-
-开发指南
---------
-* `新写Layer <../doc/dev/new_layer/index.html>`_
-* `如何贡献文档 <howto/how_to_write_docs/index.html>`_
-
-算法教程
---------
-
-* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
-* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
-* `双层RNN示例 <algorithm/rnn/hierarchical-rnn.html>`_
-* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
-
-常见问题
---------
-
-* `常见问题 <faq/index.html>`_
diff --git a/doc_cn/introduction/index.md b/doc_cn/introduction/index.md
deleted file mode 100644
index 164cb7d4943dfbfcc00a2df7329ae2a877b2d703..0000000000000000000000000000000000000000
--- a/doc_cn/introduction/index.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# 简介
-
-PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的：你可以通过简单的十数行配置搭建经典的神经网络模型；它也是高效强大的：PaddlePaddle可以支撑复杂集群环境下超大模型的训练，令你受益于深度学习的前沿成果。在百度内部，已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
-
-这份简短的介绍将像你展示如何利用PaddlePaddle解决一个经典的学习问题。
-
-## 1. 一个经典的任务
-
-让我们从一个基础问题开始：<a href="https://www.baidu.com/s?wd=单变量线性回归">单变量的线性回归</a>。问题假定观测到了一批二维空间上的点`(x, y) `，并且已知 `x` 和 `y` 之间存在着某种线性关系，我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型，线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景，其中 `x` 对应于房屋的大小，`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系，从而为新房屋的定价提供参考。
-
-
-## 2. 准备数据
-
-假设变量 `X` 和 `Y` 的真实关系为： `Y = 2X + 0.3`，这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点，它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样，你唯一需要增加的就是定义输入数据的类型。
-
-```python
-# -*- coding:utf-8 -*-
-# dataprovider.py
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-# 定义输入数据的类型: 2个浮点数
-@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield [x], [2*x+0.3]
-```
-
-## 3. 训练模型
-
-为了还原 `Y = 2X + 0.3`，我们先从一条随机的直线 `Y' = wX + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `Y'` 和 `Y` 的差距不断减小，最终趋于相同。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-```python
-# -*- coding:utf-8 -*-
-# trainer_config.py
-from paddle.trainer_config_helpers import *
-
-# 1. 定义数据来源，调用上面的process函数获得观测数据
-data_file = 'empty.list'
-with open(data_file, 'w') as f: f.writelines(' ')
-define_py_data_sources2(train_list=data_file, test_list=None, 
-        module='dataprovider', obj='process',args={})
-
-# 2. 学习算法。控制如何改变模型参数 w 和 b
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. 神经网络配置
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-# 线性计算单元: y_predict = wx + b
-y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-# 损失计算，度量 y_predict 和真实 y 之间的差距
-cost = regression_cost(input=y_predict, label=y)
-outputs(cost)
-```
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 首先，第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的`process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法，但这里使用一个简单的基于momentum的算法就足够了，它每次读取12个数据进行计算和模型更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元（Layer），所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元：
-	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到下游的其它单元。这里数据层有两个，分别对应于变量 `X` 和 `Y`。
-	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以挖掘复杂的数据关系。
-	- **回归损失层**：回归损失层 `regression_cost`是众多损失函数层的一种，它们在训练过程作为网络的出口，用来计算模型的表现，并指导模型参数的改变。
-
-这样定义了网络结构并保存为`trainer_config.py`之后，运行训练命令即可：
- ```
- paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- ```
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加损失函数的输出在不断的减小，这意味着模型在不断的改进，直到逼近真实解：` Y = 2X + 0.3 `
-
-## 4. 模型检验
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测，然后评价预测的效果。但在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-```python
-import numpy as np
-import os
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-        
-print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-# w=1.999743, b=0.300137
-```
-<center> ![](./parameters.png) </center>
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型重合。
-
-这样，我们就完成了对单变量线性回归问题的解决：将数据输入PaddlePaddle，训练模型，最后验证结果。
-
-## 5. 推荐后续阅读
-
-- <a href="../build_and_install/index.html">安装/编译</a>：PaddlePaddle的安装与编译文档。
-- <a href="../demo/quick_start/index.html">快速入门 </a>：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
-- <a href="../demo/index.html">示例</a>：各种实用案例，涵盖图像、文本、推荐等多个领域。
diff --git a/doc_cn/ui/cmd/dump_config.rst b/doc_cn/ui/cmd/dump_config.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
deleted file mode 100644
index 6d62180a6a5e3f2490cccd2a90213050aa3c172e..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/index.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-命令行参数
-==========
-
-安装好的PaddlePaddle脚本包括多条命令，他们是
-
-* paddle train即为PaddlePaddle的训练进程。可以使用paddle train完成单机多显卡多线程的训
-  练。也可以和paddle pserver组合使用，完成多机训练。
-* paddle pserver为PaddlePaddle的parameter server进程。负责多机训练中的参数聚合工作。
-* paddle version可以打印出PaddlePaddle的版本和编译时信息。
-* merge_model 可以将PaddlePaddle的模型和配置打包成一个文件。方便部署分发。
-* dump_config 可以将PaddlePaddle的训练模型以proto string的格式打印出来
-* make_diagram 可以使用graphviz对PaddlePaddle的网络模型进行绘制，方便调试使用。
-
-更详细的介绍请参考各个命令的命令行参数文档。
-
-..  toctree::
-    :glob:
-
-    paddle_train.rst
-    paddle_pserver.rst
-    paddle_version.rst
-    merge_model.rst
-    dump_config.rst
-    make_diagram.rst
diff --git a/doc_cn/ui/cmd/make_diagram.rst b/doc_cn/ui/cmd/make_diagram.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/merge_model.rst b/doc_cn/ui/cmd/merge_model.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/paddle_pserver.rst b/doc_cn/ui/cmd/paddle_pserver.rst
deleted file mode 100644
index 891975c34af5c34dddc754b79bd3e1adda9d9671..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_pserver.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-paddle pserver的命令行参数
-==========================
diff --git a/doc_cn/ui/cmd/paddle_train.rst b/doc_cn/ui/cmd/paddle_train.rst
deleted file mode 100644
index 87b84f5cbdbbe016d9bcdbda2cb30d93d2ad8022..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_train.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-paddle train的命令行参数
-========================
diff --git a/doc_cn/ui/cmd/paddle_version.rst b/doc_cn/ui/cmd/paddle_version.rst
deleted file mode 100644
index 0a4f8dd472a6009ef6832df75be043c24bb32ba0..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_version.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-paddle version的命令行参数
-==========================
-
-paddle version可以打印出paddle的版本信息和编译的选项。常见的输出格式为
-
-..  literalinclude:: paddle_version.txt
-
-其第一行说明了paddle的版本，后面跟着一系列编译参数。这里可以参考paddle的
-`编译参数选项文件 <../../build/cmake/compile_options.html>`_
diff --git a/doc_cn/ui/cmd/paddle_version.txt b/doc_cn/ui/cmd/paddle_version.txt
deleted file mode 100644
index 33e2e4de7c24afd481eb6ca7eabed4924863d2b7..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b, compiled with
-    with_avx: ON
-    with_gpu: ON
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning: OFF
-    with_timer: OFF
-    with_predict_sdk: OFF
diff --git a/doc_cn/ui/data_provider/index.rst b/doc_cn/ui/data_provider/index.rst
deleted file mode 100644
index ec8f8e5dc5b29e3504d0087e844c1f14436919d9..0000000000000000000000000000000000000000
--- a/doc_cn/ui/data_provider/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-PaddlePaddle的数据提供(DataProvider)介绍
-========================================
-
-数据提供(DataProvider)是PaddlePaddle负责提供数据的模块。其作用是将训练数据传入内存或者显存，让神经网络可以进行训练。简单的使用，用户可以使用Python的 :code:`PyDataProvider` 来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 :code:`DataProvider` 。
-
-PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider及其参数，训练文件列表(train.list)和测试文件列表(test.list)。
-
-其中，train.list和test.list均为本地的两个文件(推荐直接放置到训练目录，以相对路径引用)。如果test.list不设置，或者设置为None，那么在训练过程中，不会执行测试操作。否则，会根据命令行参数指定的测试方式，在训练过程中进行测试，从而防止过拟合。
-
-一般情况下，train.list和test.list为纯文本文件，一行对应一个数据文件，数据文件存放在本地磁盘中。将文件的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)写在train.list和test.list中。当然，train.list和test.list也可以放置hdfs文件路径，或者数据库连接地址等等。
-
-用户在DataProvider中需要实现如何访问其中每一个文件。DataProvider的具体用法和如何实现一个新的DataProvider，请参考下述文章:
-
-..	toctree::
-
-	pydataprovider2.rst
-	write_new_dataprovider.rst
diff --git a/doc_cn/ui/data_provider/mnist_provider.py b/doc_cn/ui/data_provider/mnist_provider.py
deleted file mode 100644
index 8b828641d55735e67ca634107d5b239150649651..0000000000000000000000000000000000000000
--- a/doc_cn/ui/data_provider/mnist_provider.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-
-
-# Define a py data provider
-@provider(input_types=[dense_vector(28 * 28), integer_value(10)])
-def process(settings, filename):  # settings is not used currently.
-    f = open(filename, 'r')  # open one of training file
-
-    for line in f:  # read each line
-        label, pixel = line.split(';')
-
-        # get features and label
-        pixels_str = pixel.split(' ')
-
-        pixels_float = []
-        for each_pixel_str in pixels_str:
-            pixels_float.append(float(each_pixel_str))
-
-        # give data to paddle.
-        yield pixels_float, int(label)
-
-    f.close()  # close file
diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst
deleted file mode 100644
index 80b40084d8f5037a76df0b3e01ed5742d8476bd0..0000000000000000000000000000000000000000
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ /dev/null
@@ -1,257 +0,0 @@
-PyDataProvider2的使用
-=====================
-
-PyDataProvider是PaddlePaddle使用Python提供数据的推荐接口。使用该接口用户可以只关注如何
-从文件中读取每一条数据，而不用关心数据如何传输给PaddlePaddle，数据如何存储等等。该数据
-接口使用多线程读取数据，并提供了简单的Cache功能。
-
-
-简单的使用场景
---------------
-
-这里以MNIST手写识别为例，来说明简单的PyDataProvider如何使用。MNIST是一个包含有
-70,000张灰度图片的数字分类数据集。对于MNIST而言，标签是0-9的数字，而特征即为
-28*28的像素灰度值。这里我们使用简单的文本文件表示MNIST图片，样例数据如下。
-
-..  literalinclude:: mnist_train.txt
-
-其数据使用;间隔，第一段数据为这张图片的label，第二段数据为这个图片的像素值。
-首先我们将这个数据文件(例如文件名是'mnist_train.txt')写入train.list。那么
-train.list即为
-
-..  literalinclude:: train.list
-
-那么对应的dataprovider既为
-
-..  literalinclude:: mnist_provider.py
-    :linenos:
-
-其中第一行是引入PaddlePaddle的PyDataProvider2包。主要函数是process函数。process函数
-具有两个参数，第一个参数是 settings 。这个参数在这个样例里没有使用，具
-体可以参考 settings 。第二个参数是filename，这个参数被PaddlePaddle进程传入，为
-train.list中的一行(即train.list若干数据文件路径的某一个路径)。
-
-:code:`@provider` 是一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_
-。这行的作用是设置DataProvider的一些属性，并且标记process函数是一个DataProvider。
-如果不了解 `Decorator <http://www.learnpython.org/en/Decorators>`_ 是什么也没关系，
-只需要知道这只是一个标记属性的方法就可以了。
-
-属性 `input_types`_ 是设置这个DataProvider返回什么样的数据。这里设置的是返回一个
-28*28的稠密向量和一个[0-9]，10维的整数值。 `input_types`_ 具体可以设置成什么其他格
-式，请参考 `input_types`_ 的文档。
-
-process函数是实现数据输入的主函数，在这个函数中，实现了打开文本文件，从文本文件中读取
-每一行，并将每行转换成和 `input_types`_ 一致的特征，并在23行返回给PaddlePaddle进程。需要注意
-的是， 返回的顺序需要和 `input_types`_ 中定义的顺序一致。
-
-同时，返回数据在PaddlePaddle中是仅仅返回一条完整的训练样本，并且使用关键词 :code:`yield` 。
-在PyDataProvider中，可以为一个数据文件返回多条训练样本(就像这个样例一样)，只需要在
-process函数调用多次 :code:`yield` 即可。 :code:`yield` 是Python的一个关键词，相关的概
-念是 :code:`generator` 。使用这个关键词，可以在一个函数里，多次返回变量。
-
-在训练配置里，只需要使用一行代码即可以设置训练引用这个DataProvider。这个设置为
-
-..  literalinclude:: mnist_config.py
-
-这里说明了训练数据是 'train.list'，而没有测试数据。引用的DataProvider是 'mnist_provider' 
-这个模块中的 'process' 函数。
-
-同时，根据模型配置文件中 :code:`data_layer` 的名字，用户也可以显式指定返回的数据对应关系。例如:
-
-.. literalinclude:: mnist_provider.dict.py
-   :linenos:
-
-如果用户不指定返回数据的对应关系，那么PaddlePaddle会粗略的根据layer的声明顺序，
-来确定对应关系。这个对应关系可能不正确。所以推荐使用显式指定返回值和数据对应关系。
-
-至此，简单的PyDataProvider样例就说明完毕了。对于用户来说，讲数据发送给PaddlePaddle，仅仅需要
-知道如何从 **一个文件** 里面读取 **一条** 样本。而PaddlePaddle进程帮助用户做了
-
-* 将数据组合成Batch训练
-* Shuffle训练数据
-* 多线程数据读取
-* 缓存训练数据到内存(可选)
-* CPU->GPU双缓存
-
-是不是很简单呢？
-
-序列模型数据提供
-----------------
-
-序列模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，
-不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列
-数据。
-
-这里举例的数据是英文情感分类的数据。数据是给一段英文文本，分类成正面情绪和
-负面情绪两类(用0和1表示)。样例数据为
-
-..  literalinclude:: sentimental_train.txt
-
-这里，DataProvider可以是
-
-..  literalinclude:: sentimental_provider.py
-
-这个序列模型比较复杂。主要是增加了初始化机制。其中 :code:`on_init` 函数是使用
-`@provider`_ 中的 `init_hook`_ 配置参数配置给DataProvider的。这个函数会在
-DataProvider创建的时候执行。这个初始化函数具有如下参数:
-
-* 第一个参数是 settings 对象。
-* 其他参数均使用key word argument形式传入。有部分参数是Paddle自动生成的，
-  参考 `init_hook`_ 。这里的 :code:`dictionary` 是从训练配置传入的dict对象。
-  即从单词字符串到单词id的字典。
-
-传入这个变量的方式为
-
-..  literalinclude:: sentimental_config.py
-
-这个声明基本上和mnist的样例一致。除了
-
-* 在配置中读取了字典
-* 在声明DataProvider的时候传入了dictionary作为参数。
-
-在 :code:`on_init` 函数中，配置了 `input_types` 。这个和在 `@provider`_ 中配置
-`input_types` 效果一致，但是在 `on_init` 中配置 `input_types` 是在运行时执行的，所以
-可以根据不同的数据配置不同的输入类型。这里的输入特征是词id的序列，所以将 :code:`seq_type`
-设置成了序列(同时，也可以使用 :code:`integer_sequence` 类型来设置)。
-
-同时，将字典存入了settings 对象。这个字典可以在 :code:`process` 函数中使用。 :code:`process`
-函数中的 settings 和 :code:`on_init` 中的settings 是同一个对象。
-
-而在 :code:`process` 函数中，基本的处理逻辑也和mnist逻辑一致。依次返回了文件中的每条数据。
-
-至此，基本的PyDataProvider使用介绍完毕了。具体DataProvider还具有什么功能，请参考下节reference。
-
-参考(Reference)
----------------
-
-@provider
-+++++++++
-
-:code:`@provider` 是一个Python的 `Decorator`_ ，他可以将某一个函数标记成一个PyDataProvider。它包含的参数有:
-
-*  `input_types`_ 是数据输入格式。具体有哪些格式，参考 `input_types`_ 。
-*  should_shuffle 是个DataProvider是不是要做shuffle，如果不设置的话，训练的时候默认shuffle，
-   测试的时候默认不shuffle。
-*  min_pool_size 是设置DataProvider在内存中最小暂存的数据条数。这个也是PaddlePaddle所能够保证的shuffle粒度。
-   设置成-1的话，会预先读取全部数据到内存中。
-*  pool_size 是设置DataProvider在内存中暂存的数据条数。设置成-1的话，即不在乎内存暂存多少条数据。
-*  can_over_batch_size 表示是否允许Paddle暂存略微多余pool_size的数据。这样做可以避免很多死锁问题。
-   一般推荐设置成True
-*  calc_batch_size 传入的是一个函数，这个函数以一条数据为参数，返回batch_size的大小。默认情况下一条数据
-   是一个batch size，但是有时为了计算均衡性，可以将一条数据设置成多个batch size
-*  cache 是数据缓存的策略，参考 `cache`_
-*  init_hook 是初始化时调用的函数，参考 `init_hook`_
-*  check 设置成true的话，会根据input_types检查数据的合法性。
-*  check_fail_continue 如果设置成true的话，即使在check中数据不合法，也会扔到这条数据，继续训练。 如果
-   check是false的话，没有作用。
-
-input_types
-+++++++++++
-
-PaddlePaddle的数据包括四种主要类型，和三种序列模式。其中，四种数据类型是
-
-* dense_vector 表示稠密的浮点数向量。
-* sparse_binary_vector 表示稀疏的零一向量，即大部分值为0，有值的位置只能取1
-* sparse_float_vector 表示稀疏的向量，即大部分值为0，有值的部分可以是任何浮点数
-* integer 表示整数标签。
-
-而三种序列模式为
-
-* SequenceType.NO_SEQUENCE 即不是一条序列
-* SequenceType.SEQUENCE 即是一条时间序列
-* SequenceType.SUB_SEQUENCE 即是一条时间序列，且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同，列表如下
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中，f代表一个浮点数，i代表一个整数。
-
-init_hook
-+++++++++
-
-init_hook可以传入一个函数。这个函数在初始化的时候会被调用。这个函数的参数是:
-
-* 第一个参数是 settings 对象。这个对象和process的第一个参数一致。具有的属性有
-    * settings.input_types 设置输入类型。参考 `input_types`_
-    * settings.logger 一个logging对象
-* 其他参数都使用key word argument传入。这些参数包括paddle定义的参数，和用户传入的参数。
-    * Paddle定义的参数包括:
-        * is_train bool参数，表示这个DataProvider是训练用的DataProvider或者测试用的
-          DataProvider
-        * file_list 所有文件列表。
-    * 用户定义的参数使用args在训练配置中设置。
-
-注意，PaddlePaddle保留添加参数的权力，所以init_hook尽量使用 :code:`**kwargs` , 来接受不使用的
-函数来保证兼容性。
-
-cache
-+++++
-
-DataProvider提供了两种简单的Cache策略。他们是
-
-* CacheType.NO_CACHE 不缓存任何数据，每次都会从python端读取数据
-* CacheType.CACHE_PASS_IN_MEM 第一个pass会从python端读取数据，剩下的pass会直接从内存里
-  读取数据。 
-
-
-注意事项
---------
-
-可能的内存泄露问题
-++++++++++++++++++
-
-PaddlePaddle将train.list中的每一行，都传递给process函数，从而生成多个generator。
-即如果train.list中，有100个训练文件，即会生成100个generator。这个本身不是一个很
-严重的问题。
-
-但是，如果在训练时，每一条训练数据都是一个文件，并且，训练数据非常多的情况下，就
-会生成多个generator。每个generator在没有调用的时候，是几乎不占内存的。但是，当调
-用过一次的时候，generator便会存下当前的上下文(Context)。而这个Context可能会非常
-大。并且，generator至少调用两次才会知道是否停止。所以，即使在process里面只会有一
-个yield，也需要两次随机选择到同样的generator的时候，才会释放该段内存。
-
-..  code-block:: python
-
-    def func():
-        yield 0
-
-    f = func()  # 创建generator
-    tmp = next(f)  # 调用一次，返回0
-    tmp = next(f)  # 调用第二次的时候，才会Stop Iteration
-
-而如果按顺序调用这些generator就不会出现这个问题。
-
-所以最佳实践推荐不要将每一个样本都放入train.list。而是将样本的地址放入另一个文本
-文件，train.list写入那个文本文件的地址。 或者在python generator的上下文中尽量留
-下非常少的变量引用。例如
-
-..  code-block:: python
-
-    def real_process(fn):
-        # ... read from fn
-        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
-
-    def process(fn):
-        yield real_process(fn)
-
-这个问题是PyDataProvider读数据时候的逻辑问题，基本上不能整体修正。
-
-
-内存不够用的情况
-++++++++++++++++
-
-PyDataProvider2会尽量使用内存。所以如果对于内存比较小的机器，推荐设置
-:code:`pool_size` 变量，而这个变量推荐大于训练的batch size，并且在内存足够
-的情况下越大越好。
-
diff --git a/doc_cn/ui/data_provider/write_new_dataprovider.rst b/doc_cn/ui/data_provider/write_new_dataprovider.rst
deleted file mode 100644
index a2495fe66371eb0cf678434f43feb6f91d93f3cf..0000000000000000000000000000000000000000
--- a/doc_cn/ui/data_provider/write_new_dataprovider.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-自定义一个DataProvider
-====================
-
-TBD
\ No newline at end of file
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
deleted file mode 100644
index 5aba272c627204110a56337f0f120f3f2cd37ae9..0000000000000000000000000000000000000000
--- a/doc_cn/ui/index.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-用户接口
-========
-
-数据提供
-''''''''
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/index.rst
-
-
-命令行参数
-''''''''''
-* `Use Case <../../doc/ui/cmd_argument/use_case.html>`_
-* `Argument Outline <../../doc/ui/cmd_argument/argument_outline.html>`_
-* `Detail Description <../../doc/ui/cmd_argument/detail_introduction.html>`_
-
-
-预测
-''''
-
-..  toctree::
-
-    predict/swig_py_paddle.rst
diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc_cn/ui/predict/swig_py_paddle.rst
deleted file mode 100644
index 012ac4ff6e66a022fa7d8af798236f55b62011ec..0000000000000000000000000000000000000000
--- a/doc_cn/ui/predict/swig_py_paddle.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-PaddlePaddle的Python预测接口
-==================================
-
-PaddlePaddle目前使用Swig对其常用的预测接口进行了封装，使在Python环境下的预测接口更加简单。
-在Python环境下预测结果，主要分为以下几个步骤。
-
-* 读入解析训练配置
-* 构造GradientMachine
-* 准备数据
-* 预测
-
-典型的预测代码如下，使用mnist手写识别作为样例, 完整代码见
-:code:`src_root/doc/ui/predict/predict_sample.py` 。
-
-..  literalinclude:: ../../../doc/ui/predict/predict_sample.py
-    :language: python
-    :lines: 15-18,90-100,101-104
-
-主要的软件包为py_paddle.swig_paddle，这个软件包文档相对完善。可以使用python的
-:code:`help()` 函数查询文档。主要步骤为:
-
-* 在程序开始阶段，使用 :code:`swig_paddle.initPaddle()` 传入命令行参数初始化
-  PaddlePaddle。详细的命令行参数请参考
-  `命令行参数 <../cmd_argument/detail_introduction.html>`_ 。
-* 接下来使用 :code:`parse_config()` 解析训练时的配置文件。这里要注意预测数据通常
-  不包含label, 而且预测网络通常直接输出最后一层的结果而不是像训练时一样以cost
-  layer作为输出，所以用于预测的配置文件要做相应的修改。
-* 使用 :code:`swig_paddle.GradientMachine.createFromConfigproto()` 根据上一步解
-  析好的配置创建神经网络。
-* 创建一个 :code:`DataProviderConverter` 对象converter。
-    - swig_paddle接受的原始数据是C++的Matrix，也就是直接写内存的float数组。
-      这个接口并不用户友好。所以，我们提供了一个工具类DataProviderConverter。
-      这个工具类接收和PyDataProvider2一样的输入数据，详情请参考
-      `PyDataProvider2文档 <../../../doc/ui/data_provider/pydataprovider2.html>`_ 。
-* 最后使用 :code:`forwardTest()` 直接提取出神经网络Output层的输出结果。典型的输出结果为\:
-
-..  code-block:: text
-
-    [{'id': None, 'value': array([[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
-          1.43630644e-02,   1.51111044e-13,   9.85625684e-01,
-          2.08823112e-10,   2.32777140e-08,   2.00186201e-09,
-          1.15501715e-08],
-       [  9.99982715e-01,   1.27787406e-10,   1.72296313e-05,
-          1.49316648e-09,   1.36540484e-11,   6.93137714e-10,
-          2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
-          4.48684503e-08]], dtype=float32)}]
-
-其中，value即为softmax层的输出。由于数据是两条，所以输出的value包含两个向量 。
diff --git a/doc_theme/static/css/override.css b/doc_theme/static/css/override.css
new file mode 100644
index 0000000000000000000000000000000000000000..438a87848a0176a7857177aeb672c59f35bd8d4b
--- /dev/null
+++ b/doc_theme/static/css/override.css
@@ -0,0 +1,506 @@
+body {
+    padding-top: 80px;
+    background-image: none !important;
+    font-family: Roboto;
+}
+a, a:focus, a:hover, a:visited {
+    color: #597cf1;
+}
+.site-header {
+    position: fixed;
+    top: 0;
+    width: 100%;
+    left: 0;
+    z-index: 99;
+    background: #333;
+    height: 80px;
+    display: -webkit-flex;
+    display: -ms-flex;
+    display: -o-flex;
+    display: flex;
+    flex-flow: row nowrap;
+    justify-content: space-between;
+    box-shadow: #ccc 0 3px 3px;
+}
+.site-header > div {
+    height: 80px;
+    display: inline-block;
+    background-color: #2f323a;
+    padding: 0 30px;
+}
+.site-header .site-logo {
+    line-height: 80px;
+    width: 290px;
+    flex: 0 1 290px;
+}
+.site-header .site-logo > a {
+    display: inline-block;
+    width: 230px;
+}
+.site-header .site-nav-links {
+    flex: 0 1 100%;
+}
+.site-header .site-nav-links .site-menu {
+    height: 30px;
+    line-height: 30px; 
+    font-size: 12px;
+    background: -webkit-linear-gradient(#282b33, #2f323a);
+    background: -o-linear-gradient(#282b33, #2f323a);
+    background: -moz-linear-gradient(#282b33, #2f323a);
+    background: linear-gradient(to left, #282b33, #2f323a);
+    margin-right: -30px;
+    padding-right: 30px;
+}
+.site-header .site-nav-links .site-menu .site-page-links {
+    display: inline-block;
+    float: right;
+    margin-right: 20px;
+}
+.site-header .site-nav-links .site-menu .site-page-links> li {
+    display: inline-block;
+    float: left;
+}
+.site-header .site-nav-links .site-menu .site-page-links > li > a {
+    color: #a7adbd;
+    display: inline-block;
+    height: 30px;
+    padding: 0 20px;
+    font-size: 12px;
+}
+.site-header .site-nav-links .site-menu .site-page-links > li:hover > a,
+.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
+    background-color: #2f323a;
+    color: #bcc1d0;
+}
+.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
+    font-weight: bold;
+}
+.site-header .site-nav-links .site-menu .fork-on-github {
+    color: #597cf1;
+    line-height: 30px;
+    display: inline-block;
+    padding: 0 0 0 20px;
+    float: right;
+    position: relative;
+}
+.site-header .site-nav-links .site-menu .fork-on-github .fa {
+    margin-right: 5px;
+    font-size: 16px;
+    vertical-align: middle;
+}
+.site-header .site-nav-links .site-menu .language-switcher {
+    height: 30px;
+    display: inline-block;
+    float: right;
+    line-height: 30px;
+    padding: 0 20px;
+    position: relative;
+}
+.site-header .site-nav-links .site-menu .language-switcher > a {
+    color: #a7adbd;
+}
+.site-header .site-nav-links .site-menu .language-switcher.open > a {
+    background-color: #24272f;
+    color: #bcc1d0;
+}
+.site-header .site-nav-links .site-menu .language-switcher .fa {
+    margin-left: 5px;
+}
+.site-header .site-nav-links .site-menu .language-switcher .fa-angle-down {
+    display: inline;
+}
+.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-down {
+    display: none;
+}
+.site-header .site-nav-links .site-menu .language-switcher .fa-angle-up {
+    display: none;
+}
+.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-up {
+    display: inline;
+}
+.site-header .site-nav-links .site-menu .fork-on-github:before,
+.site-header .site-nav-links .site-menu .language-switcher:before {
+    width: 1px;
+    height: 12px;
+    top: 9px;
+    background-color: #3a3d47;
+    left: 0;
+    display: inline-block;
+    position: absolute;
+    content: "";
+}
+.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu {
+    display: none;
+    position: absolute;
+    box-shadow: #ccc 0 0 5px;
+    background-color: #fff;
+    width: 100%;
+    left: 0;
+    top: 30px;
+}
+.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li {
+    line-height: 30px;
+    padding: 0 20px;
+}
+.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li:hover {
+    background-color: #f7f8fe;
+}
+.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li + li {
+    border-top: 1px solid #dedfe5;
+}
+.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li > a {
+    color: #2f323a;
+}
+.site-header .site-nav-links .site-menu .language-switcher.open .dropdown-menu {
+    display: inline-block;
+}
+.site-header .site-nav-links .doc-module {
+    display: block;
+    height: 50px;
+    line-height: 50px;
+}
+.site-header .site-nav-links .doc-module > ul > li {
+    display: inline-block;
+    float: left;
+}
+.site-header .site-nav-links .doc-module > ul > li > a {
+    color: #c9cbd0;
+    font-size: 14px;
+    display: inline-block;
+    height: 50px;
+    line-height: 50px;
+    border-bottom: 2px solid transparent;
+    padding: 0 20px;
+}
+.site-header .site-nav-links .doc-module > ul > li:hover > a {
+    color: #fff;
+}
+.site-header .site-nav-links .doc-module > ul > li.current > a {
+    border-bottom-color: #fff;
+    color: #fff;
+}
+.site-header .site-nav-links .doc-module [role="search"]{
+    float: right;
+}
+.site-header .site-nav-links .doc-module [role="search"] input {
+    background-color: #3a3d47;
+    border-radius: 15px;
+    color: #a7adbd;
+    border: 1px solid transparent;
+    padding: 6px 15px;
+    width: 180px;
+    box-shadow: none;
+    transition: all .2s;
+    -webkit-transition: all .2s;
+    -moz-transition: all .2s;
+    -o-transition: all .2s;
+    background-repeat: no-repeat;
+    background-position: 145px center;
+    background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO7K8dhFMfx1w8LBqVM5DLxF7hMTGSQpAwmJSkDizAZLSb5Ayi3clsMFgwWISGXkoSyGYRSym15fvr27duvH5/leTqd8+6c83ye1NLatohqMIgWVOEV+5jDAr7ElBO5j+IIH+hBJRqwjDHsoTQOyAvnCPpRi4tYziVmMY2dkPMc7aAG42hPKE7rAwMBNhEfYQgzOJNZ3xhGL4qigGasyk43OEdjFFCGe9nrNtT8Al5Q8AdAMd6jgFPU/QFwiN0oYD4sJzdLwBiuo4A5vGEKqQyF1ahPcuInOsJrrKMiwWx9OMAWWpOc+BD2MImr4Ik7FIb4AzqRH6zdhU1IxT4TlKAJ5XjCMU6CkaANi2lIXsKsj1jJsIsNdKc7yfE/pSGTPwMABBFCGflm+rsAAAAASUVORK5CYII=");
+}
+.site-header .site-nav-links .doc-module [role="search"] input:focus {
+   width: 300px;
+}
+.site-header .site-nav-links .doc-module [role="search"] input:focus {
+    background-position: 265px center;
+}
+.site-header .site-nav-links .doc-module [role="search"] input:hover,
+.site-header .site-nav-links .doc-module [role="search"] input:focus {
+   color: #fff;
+   border-color: #597cf1;
+   background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO9K4ZhFMfxz4MFg1Im8jJ5/gIvExMZJCnFpCRlYBEGGS0m+QMoLwOyGCwyWISEvJQklM0glFLeluvR3d3d08Nvua5O53w751y/K9Uz+SyiNIbRihq8Yh+LWMaXmPIi93Ec4QN9qEYjVjGBPZTHAQXhHMMg6nARy7nEAuawE3Keox2kMYWOhOKMPjAUYNPxEUYwjzPZ9Y1R9KMkCmjButx0g3M0RQEVuJe7bkPNL+AFRX8AlOI9CjhF/R8Ah9iNApbCcvJzBEzgOgpYxBtmkcpSWIuGJCd+ojO8xgaqEsw2gANsoy3JiQ9hDzO4Cp64Q3GIP6ALhcHa3diCVOwzQRmaUYknHOMkGAnasZKBFCTM+oi1LLvYRG+mkzz/UwYy8zMAmkpBg3fGpFUAAAAASUVORK5CYII=");
+}
+.doc-menu-vertical {
+    display: inline-block;
+    float: left;
+    width: 240px;
+    height: 100%;
+    background-color: #ecedee;
+    position: absolute;
+    left: 0;
+    top: 0;
+    overflow: hidden;
+    padding: 0;
+    border-right: 1px solid #dddfe3;
+}
+.doc-menu-vertical > ul {
+    display: none;
+}
+.doc-menu-vertical > ul.current{
+    display: block;
+}
+.doc-menu-vertical > ul.current > li.toctree-l1 {
+    display: none;
+}
+.doc-menu-vertical > ul.current > li.toctree-l1.current {
+    display: block;
+}
+.doc-menu-vertical > ul.current > li.toctree-l1.current > a {
+    display: none;
+}
+.doc-menu-vertical .toctree-l2  a {
+    width: 100%;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+    padding-right: 30px;
+}
+.doc-menu-vertical .toctree-l2 > a {
+    font-size: 14px;
+    color: #2f323a;
+    padding-left: 30px;
+    line-height: 50px;
+    display: block;
+    font-weight: bold;
+    border-bottom: 1px solid #dddfe3;
+}
+.doc-menu-vertical .toctree-l2.has-child > a:after {
+    font-family: "FontAwesome";
+    display: inline-block;
+    font-style: normal;
+    font-weight: normal;
+    text-decoration: inherit;
+    content: "";
+    float: right;
+    line-height: 50px;
+    color: #a7adbd;
+    position: absolute;
+    right: 15px;
+}
+.doc-menu-vertical .toctree-l2.has-child.current > a:after {
+    content: "";
+}
+.doc-menu-vertical .toctree-l2 > a + ul{
+    background-color: #e4e6e9;
+    height: 0;
+    overflow: hidden;
+}
+.doc-menu-vertical .toctree-l2.current > a + ul {
+    border-bottom: 1px solid #dddfe3;
+    height: auto;
+}
+.doc-menu-vertical .toctree-l2 li.active > a {
+    background-color: #597cf1;
+    color: #fff;
+}
+.doc-menu-vertical .toctree-l3 > a {
+    font-size: 12px;
+    color: #2f323a;
+    padding-left: 30px;
+    line-height: 40px;
+    display: block;
+}
+.doc-menu-vertical .toctree-l4 > a {
+    font-size: 12px;
+    color: #64697b;
+    padding-left: 50px;
+    line-height: 30px;
+    display: block;
+}
+.doc-menu-vertical .toctree-l5 > a {
+    font-size: 14px;
+    color: #ccc;
+    padding-left: 40px;
+    display: block;
+}
+.local-toc {
+    position: absolute;
+    height: 100%;
+    background-color: #f6f7f8;
+    top: 0;
+    left: 240px;
+    padding: 0;
+    z-index: 9;
+}
+.local-toc:after {
+    content: "";
+    position: absolute;
+    height: 100%;
+    width: 1px;
+    display: inline-block;
+    right: 0;
+    background-color: #dddfe3;
+    top: 0;
+    z-index: -1;
+}
+.local-toc:hover a {
+    width: auto;
+}
+.local-toc > ul > li a {
+    position: relative;
+    font-size: 12px;
+    overflow: hidden;
+    display: none;
+}
+.local-toc > ul > li > ul > li a {
+    display: block;
+    border-top: 1px solid transparent;
+    border-bottom: 1px solid transparent;
+    padding-right: 20px;
+    width: 50px;
+}
+.local-toc > ul > li > ul > li > ul > li > ul a {
+    display: none;
+}
+.local-toc > ul > li > ul li > a:after {
+    content: "";
+    display: inline-block;
+    width: 1px;
+    height: 100%;
+    background-color: transparent;
+    position: absolute;
+    right: 0;
+    top: 0;
+}
+.local-toc > ul > li > ul li a:hover{
+    background-color: #e6eaf7 !important;
+}
+.local-toc > ul > li > ul li a:hover:after {
+    background-color: #e6eaf7 !important;
+}
+.local-toc > ul > li > ul li.active > a {
+    color: #ff9711;
+    background-color: #fff;
+    border-top: 1px solid #dddfe3;
+    border-bottom: 1px solid #dddfe3;
+}
+.local-toc > ul > li > ul li.active > a:before {
+    background-color: #ff9711;
+    width: 10px;
+    height: 10px;
+    margin: 15px 20px;
+    border-radius: 5px;
+}
+.local-toc > ul > li > ul li.active > a:after {
+    background-color: #fff;
+}
+.local-toc > ul > li > ul > li {
+    position: relative;
+    line-height: 40px;
+    white-space: nowrap;
+}
+.local-toc > ul > li > ul > li > a {
+    color: #64697b;
+}
+.local-toc > ul > li > ul > li > a + ul {
+    display: none;
+}
+.local-toc > ul > li > ul > li > a:before {
+    display: inline-block;
+    content: "";
+    width: 6px;
+    height: 6px;
+    background-color: #ccc;
+    border-radius: 3px;
+    margin: 17px 22px;
+    float: left;
+}
+.local-toc > ul > li > ul > li > ul > li > a {
+    color: #a7adbd;
+}
+.local-toc > ul > li > ul > li > ul > li > a:before {
+    display: inline-block;
+    content: "";
+    width: 6px;
+    height: 6px;
+    background-color: #ccc;
+    border-radius: 3px;
+    margin: 17px 22px;
+    float: left;
+}
+.main-content-wrap {
+    position: absolute;
+    width: 100%;
+    top: 80px;
+    bottom: 0;
+    overflow: auto;
+    background-color: #f6f7f8;
+}
+.doc-content-wrap {
+    margin-left: 290px;
+    height: 100%;
+    position: relative;
+    padding-top: 60px;
+    background-color: #fff;
+}
+.doc-content-wrap > div[role='navigation'] {
+    position: absolute;
+    top: 0;
+    width: 100%;
+    left: 0;
+    padding: 0 30px;
+    height: 60px;
+}
+.wy-breadcrumbs {
+    line-height: 50px;
+    height: 60px;
+    background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAUCAYAAABMDlehAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA4ZpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNS1jMDIxIDc5LjE1NTc3MiwgMjAxNC8wMS8xMy0xOTo0NDowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDpjMjhmMGQ3ZC0wODU3LTQ0ZTctOGRhZi00NGU3OTc1ZmM2MzkiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NzRBN0NEODRBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NzRBN0NEODNBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTQgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDozNWQwMzI1ZC01ZDAyLTQ1YTYtODUxOS1lNWUzNjU5NGFhMzAiIHN0UmVmOmRvY3VtZW50SUQ9ImFkb2JlOmRvY2lkOnBob3Rvc2hvcDozZGVmZmY0OS1mNjA4LTExNzktYTRlZC1kZjJiNGY3N2YwNzMiLz4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+IDw/eHBhY2tldCBlbmQ9InIiPz7FGmP1AAAAKUlEQVR42mK4/+DpfwY9Q0tBJgYGhv8g4h8uFoKLEGOAc9FYSARAgAEAUgMQYBNmQ7sAAAAASUVORK5CYII=");
+    background-repeat: repeat no-repeat;
+    background-position: center 50px;
+}
+.wy-breadcrumbs > li {
+    color: #ccc;
+}
+.wy-breadcrumbs > li a {
+    color: #ff9711;
+    padding: 0;
+}
+.wy-breadcrumbs > li:first-child a {
+    color: #597cf1;
+}
+.wy-nav-content{
+    max-width: none;
+    overflow: auto;
+    position: relative;
+    padding: 30px;
+    background-color: #fff;
+}
+.wy-nav-content h1 {
+    font-size: 24px;
+    color: #2f323a;
+    margin-bottom: 30px;
+}
+.wy-nav-content h2 {
+    font-size: 20px;
+    color: #2f323a;
+    margin-bottom: 30px;
+}
+.wy-nav-content h3 {
+    font-size: 18px;
+    color: #2f323a;
+    margin-bottom: 30px;
+}
+.wy-nav-content h4 {
+    font-size: 16px;
+    color: #2f323a;
+    margin-bottom: 30px;
+}
+.wy-nav-content p + h1,
+.wy-nav-content p + h2,
+.wy-nav-content p + h3,
+.wy-nav-content p + h4 {
+    margin-top: 20px;
+}
+.wy-nav-content p{
+    color: #2f323a;
+    margin-bottom: 20px;
+    font-size: 14px;
+}
+#search-results h2 {
+    font-size: 24px;
+    margin: 20px 0 10px 0;
+}
+#search-results p {
+    color: #a7adbd;
+}
+#search-results ul.search > li {
+    border-bottom: none;
+}
+#search-results ul.search > li > a {
+    color: #597cf1;
+}
+.rst-content .highlighted{
+    background-color: transparent;
+    color: #ff9711;
+    padding: 0;
+}
diff --git a/doc_theme/static/images/PP_w.png b/doc_theme/static/images/PP_w.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc58b0b458135773fcde5ee941ea095e3d4d07a0
Binary files /dev/null and b/doc_theme/static/images/PP_w.png differ
diff --git a/doc_theme/static/js/paddle_doc_init.js b/doc_theme/static/js/paddle_doc_init.js
new file mode 100644
index 0000000000000000000000000000000000000000..153ce30745a0a21097fb385f2d66f12e6c8d5be5
--- /dev/null
+++ b/doc_theme/static/js/paddle_doc_init.js
@@ -0,0 +1,31 @@
+$(document).ready(function(){
+    $('.local-toc').on('click' ,'a.reference.internal', function (){
+        $('.local-toc li.active').removeClass('active');
+        $(this).parent('li').addClass('active');
+    });
+
+    if ($('.local-toc a:visible').length) {
+        $('.local-toc > ul').addClass('nav nav-stacked');
+        $('#doc-content').scrollspy({
+            target: '.local-toc'
+        });
+		$('.local-toc').perfectScrollbar();
+    } else {
+		$('.doc-content-wrap').css('margin-left', '-=50px');
+        $('.local-toc').remove();
+    }
+
+    if (!$('.doc-menu-vertical > ul > li.current > ul').length) {
+        $('.doc-content-wrap').css('margin-left', '-=240px');
+        $('.doc-menu-vertical').remove();
+        $('.local-toc').css('left', '0');
+    }
+
+	$('.doc-menu-vertical .toctree-l2').each(function (i, e){
+        $(e).toggleClass('has-child', !!$(e).find('ul').length);
+    });
+
+    $('.doc-menu-vertical').find('li.current').last().addClass('active');
+
+    $('.doc-menu-vertical').perfectScrollbar();
+});
diff --git a/doc_theme/templates/breadcrumbs.html b/doc_theme/templates/breadcrumbs.html
new file mode 100644
index 0000000000000000000000000000000000000000..22f773a8e975dd901c5604b51cd9f2d67b3c9a1f
--- /dev/null
+++ b/doc_theme/templates/breadcrumbs.html
@@ -0,0 +1,24 @@
+{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
+
+{% if page_source_suffix %} 
+{% set suffix = page_source_suffix %}
+{% else %}
+{% set suffix = source_suffix %}
+{% endif %}
+
+{% if meta is defined and 'github_url' in meta %}
+{% set display_github = True %}
+{% endif %}
+
+{% if meta is defined and 'bitbucket_url' in meta %}
+{% set display_bitbucket = True %}
+{% endif %}
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+      {% for doc in parents %}
+        <li><a href="{{ doc.link|e }}">{{ doc.title }}</a> > </li>
+      {% endfor %}
+    <li>{{ title }}</li>
+  </ul>
+</div>
diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..034740369ed10a748856e2205d3315f51a7de62f
--- /dev/null
+++ b/doc_theme/templates/layout.html
@@ -0,0 +1,191 @@
+{# TEMPLATE VAR SETTINGS #}
+{%- set url_root = pathto('', 1) %}
+{%- if url_root == '#' %}{% set url_root = '' %}{% endif %}
+{%- if not embedded and docstitle %}
+  {%- set titlesuffix = " &mdash; "|safe + docstitle|e %}
+{%- else %}
+  {%- set titlesuffix = "" %}
+{%- endif %}
+
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  {{ metatags }}
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  {% block htmltitle %}
+  <title>{{ title|striptags|e }}{{ titlesuffix }}</title>
+  {% endblock %}
+
+  {# FAVICON #}
+  {% if favicon %}
+    <link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/>
+  {% endif %}
+
+  {# CSS #}
+
+  {# OPENSEARCH #}
+  {% if not embedded %}
+    {% if use_opensearch %}
+      <link rel="search" type="application/opensearchdescription+xml" title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" href="{{ pathto('_static/opensearch.xml', 1) }}"/>
+    {% endif %}
+
+  {% endif %}
+
+  {# RTD hosts this file, so just load on non RTD builds #}
+  {% if not READTHEDOCS %}
+    <link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" />
+  {% endif %}
+
+  {% for cssfile in css_files %}
+    <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
+  {% endfor %}
+  {% for cssfile in extra_css_files %}
+    <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
+  {% endfor %}
+
+  {%- block linktags %}
+    {%- if hasdoc('about') %}
+        <link rel="author" title="{{ _('About these documents') }}"
+              href="{{ pathto('about') }}"/>
+    {%- endif %}
+    {%- if hasdoc('genindex') %}
+        <link rel="index" title="{{ _('Index') }}"
+              href="{{ pathto('genindex') }}"/>
+    {%- endif %}
+    {%- if hasdoc('search') %}
+        <link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}"/>
+    {%- endif %}
+    {%- if hasdoc('copyright') %}
+        <link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}"/>
+    {%- endif %}
+    <link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}"/>
+    {%- if parents %}
+        <link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}"/>
+    {%- endif %}
+    {%- if next %}
+        <link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}"/>
+    {%- endif %}
+    {%- if prev %}
+        <link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}"/>
+    {%- endif %}
+  {%- endblock %}
+  {%- block extrahead %} 
+
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
+  <link rel="stylesheet" href="{{pathto('_static/css/override.css', 1)}}" type="text/css" />
+  <script>
+  var _hmt = _hmt || [];
+  (function() {
+    var hm = document.createElement("script");
+    hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
+    var s = document.getElementsByTagName("script")[0]; 
+    s.parentNode.insertBefore(hm, s);
+  })();
+  </script>
+
+  {% endblock %}
+
+  {# Keep modernizr in head - http://modernizr.com/docs/#installing #}
+  <script src="{{ pathto('_static/js/modernizr.min.js', 1) }}"></script>
+
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  {% block extrabody %}
+  <header class="site-header">
+    <div class="site-logo">
+      <a href="/"><img src="{{pathto('_static/images/PP_w.png', 1)}}"></a>
+    </div>
+    <div class="site-nav-links">
+      <div class="site-menu">
+        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
+        <div class="language-switcher dropdown">
+          <a type="button" data-toggle="dropdown">
+            <span>English</span>
+            <i class="fa fa-angle-up"></i>
+            <i class="fa fa-angle-down"></i>
+          </a>
+          <ul class="dropdown-menu">
+            <li><a href="/doc_cn">中文</a></li>
+            <li><a href="/doc">English</a></li>
+          </ul>
+        </div>
+        <ul class="site-page-links">
+          <li><a>Home</a></li>
+          <li><a>Get Started</a></li>
+          <li class="active"><a>Documentation</a></li>
+          <li><a>About Us</a></li>
+        </ul>
+      </div>
+      <div class="doc-module">
+        {%set modules = toctree(maxdepth=0, collapse=False, titles_only=True)%}
+        {{modules}}
+        {% include "searchbox.html" %}        
+      </div>
+    </div>
+  </header>
+  {% endblock %}
+  <div class="main-content-wrap">
+
+    {# SIDE NAV, TOGGLES ON MOBILE #}
+    <nav class="doc-menu-vertical" role="navigation">
+        {% block menu %}
+          {% set toctree = toctree(maxdepth=-1, collapse=False,titles_only=True, includehidden=True) %}
+          {{ toctree }}
+        {% endblock %}
+    </nav>
+    {% if toc %}
+    <nav class="local-toc">{{ toc }}</nav>
+    {% endif %}
+    <section class="doc-content-wrap">
+
+      {% include "breadcrumbs.html" %}
+      {# PAGE CONTENT #}
+      <div class="wy-nav-content" id="doc-content">
+        <div class="rst-content">
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+            {% block body %}{% endblock %}
+           </div>
+          </div>
+          {% include "footer.html" %}
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+  {% include "versions.html" %}
+
+  {% if not embedded %}
+
+    <script type="text/javascript">
+        var DOCUMENTATION_OPTIONS = {
+            URL_ROOT:'{{ url_root }}',
+            VERSION:'{{ release|e }}',
+            COLLAPSE_INDEX:false,
+            FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
+            HAS_SOURCE:  {{ has_source|lower }}
+        };
+    </script>
+    {%- for scriptfile in script_files %}
+      <script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script>
+    {%- endfor %}
+       
+  {% endif %}
+
+  {# RTD hosts this file, so just load on non RTD builds #}
+  {% if not READTHEDOCS %}
+    <script type="text/javascript" src="{{ pathto('_static/js/theme.js', 1) }}"></script>
+  {% endif %}
+  
+  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
+  <script src="{{ pathto('_static/js/paddle_doc_init.js', 1) }}"></script>
+  {%- block footer %} {% endblock %}
+
+</body>
+</html>
diff --git a/doc_theme/templates/search.html b/doc_theme/templates/search.html
new file mode 100644
index 0000000000000000000000000000000000000000..171ab915988db06bdf6bf88f8a7930d48f9e7bee
--- /dev/null
+++ b/doc_theme/templates/search.html
@@ -0,0 +1,52 @@
+{#
+    basic/search.html
+    ~~~~~~~~~~~~~~~~~
+
+    Template for the search page.
+
+    :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+#}
+{%- extends "layout.html" %}
+{% set title = _('Search') %}
+{% set script_files = script_files + ['_static/searchtools.js'] %}
+{% block footer %}
+  <script type="text/javascript">
+    jQuery(function() { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); });
+    jQuery('.doc-content-wrap > div[role="navigation"]').remove();
+    jQuery('.doc-content-wrap').css('padding-top', 0);
+  </script>
+  {# this is used when loading the search index using $.ajax fails,
+     such as on Chrome for documents on localhost #}
+  <script type="text/javascript" id="searchindexloader"></script>
+  {{ super() }}
+{% endblock %}
+{% block body %}
+  <noscript>
+  <div id="fallback" class="admonition warning">
+    <p class="last">
+      {% trans %}Please activate JavaScript to enable the search
+      functionality.{% endtrans %}
+    </p>
+  </div>
+  </noscript>
+
+  {% if search_performed %}
+    <h2>{{ _('Search Results') }}</h2>
+    {% if not search_results %}
+      <p>{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p>
+    {% endif %}
+  {% endif %}
+  <div id="search-results">
+  {% if search_results %}
+    <ul>
+    {% for href, caption, context in search_results %}
+      <li>
+        <a href="{{ pathto(item.href) }}">{{ caption }}</a>
+        <p class="context">{{ context|e }}</p>
+      </li>
+    {% endfor %}
+    </ul>
+  {% endif %}
+  </div>
+{% endblock %}
diff --git a/paddle/.common_test_util.sh b/paddle/.common_test_util.sh
index dc1525061590808e3cc9c7b606aca5d5d9195a3a..8d024bc7d01a26624ee5eaef339f216cdadbe2e2 100644
--- a/paddle/.common_test_util.sh
+++ b/paddle/.common_test_util.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh
index 33596fac600ed5952798b5119f7ecaa999c7d5a7..617ac79a24889eef23b66235ace20be80e1ff4dc 100755
--- a/paddle/.set_port.sh
+++ b/paddle/.set_port.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
index 657fdf65e92c9ddde7bb3b4590b48b45e0152870..fa7baccc86e0b56e57d52a40c95cfe1b98fececc 100755
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index b539374cd4aa5a9510cdb728c1b22edf65a9f880..0cafbd896e2d88aee4406bd0305878ce489bc18d 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -27,11 +27,6 @@ Arguments* Arguments::createArguments(size_t slotNum) {
 
 void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
 
-Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.value);
-}
-
 Arguments::Arguments() : m(new ArgumentsPrivate()) {}
 
 Arguments::~Arguments() { delete m; }
@@ -43,6 +38,16 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
   return args;
 }
 
+Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return Matrix::createByPaddleMatrixPtr(&a.value);
+}
+
+Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return Matrix::createByPaddleMatrixPtr(&a.grad);
+}
+
 IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return IVector::createByPaddleVectorPtr(&a.ids);
@@ -58,6 +63,11 @@ void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
   a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
 }
 
+void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
+}
+
 void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
   auto& a = m->getArg(idx);
   a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 9b2d122a09adabd766014a9d21a167eec5b2de32..6ad1d79e59b11b2c1f7aacf22d13347b3fd8e0e2 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -17,22 +17,18 @@ add_library(paddle_api STATIC
         ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)
 
+list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
 
-if(WITH_GFLAGS)
-  list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
-
-  if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-    # Because gflags compiled by cmake, so it is imported by cmake target,
-    # not a real library path. Get the real library path here.
-    message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-    get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
-    message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
-  else()
-    set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
-  endif()
+if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
+# Because gflags compiled by cmake, so it is imported by cmake target,
+# not a real library path. Get the real library path here.
+message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
+get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
+message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
+else()
+set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
 endif()
 
-
 configure_file(
     paddle_api_config.py.in
     ${PROJ_ROOT}/paddle/api/paddle_api_config.py
@@ -57,7 +53,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
             paddle_trainer
             paddle_api
             paddle_cuda
-	    ${PY_PADDLE_PYTHON_FILES}
+        ${PY_PADDLE_PYTHON_FILES}
 )
 
 install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index bc40d871d180a6bfe21200c866181dc161f5f078..2f45173bfd401ddda26d61ab7fcfe131d079f710 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index 9a4846d80980e23e97f89b6134e15af71207ae6b..297eaa19bb9981c7f07c90763d76494b7910af93 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "Internal.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
 std::vector<int> GradientMachine::defaultParamTypes = {
     PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index 66a13bc603ed5098997f168d3f527160ac3822ef..d48dd3a04c14f559e3c8ceb67226ddb36272e444 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,14 +16,13 @@ limitations under the License. */
 
 #include "PaddleAPI.h"
 
-#include <vector>
 #include <algorithm>
+#include <vector>
 
 template <typename T1, typename T2>
 void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
   dest->resize(src.size());
-  std::transform(src.begin(),
-                 src.end(),
-                 dest->begin(),
-                 [](T1 t) { return static_cast<T2>(t); });
+  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
+    return static_cast<T2>(t);
+  });
 }
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index f257ee65aa4a12dfcd1914ddbf0e16461a9b128c..7c375e5cfb91fc5824f823346af6f80c90b36821 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
 #include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include <iostream>
 #include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/SparseMatrix.h"
 
 struct MatrixPrivate {
   std::shared_ptr<paddle::Matrix> mat;
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index 6a0fbc537d9345f2221ab65d90733f4696be6880..9194a6371be9e00c037967464ee2b63c1e4f6192 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -193,5 +193,4 @@ namespace std {
 %ignore OptimizationConfigPrivate;
 %ignore ParameterTraverseCallbackPrivate;
 %include "utils/GlobalConstants.h"
-%include "api/PaddleAPI.h"
-
+%include "api/PaddleAPI.h"
\ No newline at end of file
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index c07facdb1292b34ac31247160a4347ea359e718b..84a66719c33678fc4aeb038bb81a6b7c5d0c93fb 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <stddef.h>
 #include <stdint.h>
-#include <string>
 #include <stdexcept>
+#include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/TypeDefs.h"
@@ -156,12 +156,15 @@ public:
    *  @param dim1  dimension of data.
    *  @param dim2  dimension of data.
    *  @param copy  true if copy into a new matrix, false will create
-   *               matrix inplace.
+   *               matrix inplace. copy = false should be used with extreme
+   *               care because Matrix will share the memory with the given
+   *               numpy array. If the numpy array object is no longer valid,
+   *               the memory space will not be usable.
    */
   static Matrix* createCpuDenseFromNumpy(float* data,
                                          int dim1,
                                          int dim2,
-                                         bool copy = false);
+                                         bool copy = true);
 
   /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
   static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
@@ -271,11 +274,18 @@ public:
    */
   static Vector* createCpuVectorFromNumpy(float* data,
                                           int dim,
-                                          bool copy = false);
+                                          bool copy = true);
 
   /// Create Gpu Vector from numpy array, which dtype=float32
   static Vector* createGpuVectorFromNumpy(float* data, int dim);
 
+  /**
+   * copy from another vector
+   * throw(RangeError) if size of src vector is different from size of this
+   * vector
+   */
+  void copyFrom(Vector* src) throw(RangeError);
+
   /// Cast to numpy array inplace.
   void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
 
@@ -339,7 +349,7 @@ public:
    */
   static IVector* createCpuVectorFromNumpy(int* data,
                                            int dim,
-                                           bool copy = false);
+                                           bool copy = true);
   /**
    * Create Gpu IVector from numpy array, which dtype=int32
    */
@@ -418,6 +428,7 @@ public:
    * the param idx is the slot id
    */
   Matrix* getSlotValue(size_t idx) const throw(RangeError);
+  Matrix* getSlotGrad(size_t idx) const throw(RangeError);
   IVector* getSlotIds(size_t idx) const throw(RangeError);
   Matrix* getSlotIn(size_t idx) const throw(RangeError);
   IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
@@ -434,6 +445,7 @@ public:
    * The other param is the input Matrix or vector.
    */
   void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
+  void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
   void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
   void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
   void setSlotSequenceStartPositions(size_t idx,
@@ -535,6 +547,7 @@ public:
   size_t getID() const;
 
   ParameterConfig* getConfig();
+  void setValueUpdated();
 
 private:
   static Parameter* createFromRawPtr(void* ptr);
diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h
index 5ffeff6a9726c7445db36c7c1bec7c74825884a0..d2b56fc41c8aadb136ad6812f848e764e031073c 100644
--- a/paddle/api/PaddleAPIPrivate.h
+++ b/paddle/api/PaddleAPIPrivate.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index c5876bb1c71438578831ffffd85840c706b6224c..4eed00a84a695f2c48ff93b33419ae2b3dd03768 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
 #include "paddle/parameter/Parameter.h"
+#include "PaddleAPI.h"
 
 struct ParameterPrivate {
   std::shared_ptr<paddle::Parameter> sharedPtr;
@@ -68,3 +68,5 @@ ParameterConfig* Parameter::getConfig() {
 }
 
 size_t Parameter::getID() const { return m->getPtr()->getID(); }
+
+void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index 21d031e4bcb897eb693e5cff56bc77a637dc6bd2..21b851dd5e26c4752888067b20d0b1e16a4ab52d 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
 #include "paddle/parameter/ParameterOptimizer.h"
-#include "Internal.h"
 #include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
 
 struct ParameterOptimizerPrivate {
   std::unique_ptr<paddle::ParameterOptimizer> optimizer;
@@ -36,16 +36,13 @@ struct ParameterTraverseCallbackPrivate {
              size_t sparseId) {
     std::vector<paddle::VectorPtr> real_vecs;
     real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(),
-                   vecs.end(),
-                   real_vecs.begin(),
-                   [](Vector* v) {
-                     if (v) {
-                       return *(paddle::VectorPtr*)(v->getSharedPtr());
-                     } else {
-                       return paddle::VectorPtr();
-                     }
-                   });
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
+      if (v) {
+        return *(paddle::VectorPtr*)(v->getSharedPtr());
+      } else {
+        return paddle::VectorPtr();
+      }
+    });
 
     paddle::ParameterConfig& real_conf =
         *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index d51be78d45902967107f4bf0af995958faed931a..8428edc60df6219fd1d3aebf74b0911a79d370cb 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <vector>
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/Flags.h"
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include <iterator>
 
 // used to represent partial sequence
 struct Path {
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index 7a6aa69fb652313748b1fa787847ffd74fda7a22..d83dc380beeec3747451a483f4811eb833e8c226 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,20 +16,20 @@ limitations under the License. */
 #include "PaddleAPIPrivate.h"
 
 #include <stdlib.h>
-#include <memory>
 #include <atomic>
+#include <memory>
 
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/ParamUtil.h"
 #include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/TrainerInternal.h"
 #include "paddle/utils/Flags.h"
 
 using paddle::real;
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
 
 struct TrainerPrivate : public paddle::Trainer {
   bool _trainOneBatch(size_t batchSize);
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4..c3f739568f50b6ee8b0894d06a4d7f91c7816879 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "PaddleAPI.h"
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
 
 #include <fenv.h>
+#include <algorithm>
 #include <iostream>
 #include <iterator>
-#include <algorithm>
 
 void initPaddle(int argc, char** argv) {
   paddle::initMain(argc, argv);
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index cc1c098223826a06fea291a95730d7fc1fd1beb3..874f2fd044e9e86b44f8ca69f08bdfd3287d4749 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -281,6 +281,13 @@ FloatArray Vector::getData() const {
   }
 }
 
+void Vector::copyFrom(Vector* src) throw(RangeError) {
+  if (src->m->vec->getSize() != m->vec->getSize()) {
+    throw RangeError();
+  }
+  m->vec->copyFrom(*src->m->vec);
+}
+
 bool Vector::isGpu() const {
   return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
 }
diff --git a/paddle/api/__init__.py b/paddle/api/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/paddle/api/__init__.py
+++ b/paddle/api/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index a2352250c31efa7ee3c4c8338d95dce5a5b9a511..23542b952b7699d66cf64b47d0354e9078ae06d9 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -8,9 +8,7 @@ CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-WITH_GLOG="@WITH_GLOG@"
 LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
-WITH_GFLAGS="@WITH_GFLAGS@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index ebe00798e8b7169ecbbef53e287ab4b78334bcf9..51d7dfee58b786512201577872559ae510051ba9 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -47,10 +47,8 @@ try:
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.with_glog = PaddleLDFlag.cmake_bool(WITH_GLOG)
             self.glog_libs = LIBGLOG_LIBRARY
 
-            self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
             self.gflags_location = GFLAGS_LOCATION
@@ -88,6 +86,8 @@ try:
                 "-lpaddle_cuda",
                 "-lpaddle_api",
                 self.normalize_flag(self.protolib),
+                self.normalize_flag(self.glog_libs),
+                self.normalize_flag(self.gflags_libs),
                 self.normalize_flag(self.zlib),
                 self.normalize_flag(self.thread),
                 self.normalize_flag(self.dl_libs),
@@ -96,10 +96,6 @@ try:
 
             if self.with_python:
                 libs.append(self.normalize_flag(self.python_libs))
-            if self.with_glog:
-                libs.append(self.normalize_flag(self.glog_libs))
-            if self.with_gflags:
-                libs.append(self.normalize_flag(self.gflags_libs))
             if self.with_gpu:
                 libs.append(self.normalize_flag(self.curt))
             if self.with_coverage:
diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh
index a4814f98f89c2e24195074369bc897b8b4bd2d9b..2f12ba026430ba7adb6f4dee11ed17ea3ad3f36d 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,11 +20,7 @@ popd > /dev/null
 
 cd $SCRIPTPATH
 
-if [ ! -f ../../dist/*.whl ] ; then  # Swig not compiled.
-  exit 0
-fi
-
-rm .test_env -rf
+rm -rf .test_env
 virtualenv .test_env
 source .test_env/bin/activate
 
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 70fb169fd5c43d5768e67ad8e4c62a9f4d302eaf..8cabecd242fb4eb98c0fe468687ef179245e4535 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py
index e12613fbb8a66545dd3ad20d59b0b951e86e8683..b81eafa9673ca34f1b7e06401098d55bdb1b35a5 100644
--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index 0432345edd659f13bddb1b99f62622c5ea64a4cb..37666bdccc9aedfe8f8079124129aad2ade53a43 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
 
     def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
+        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
         self.assertEqual((int(m.getHeight()), int(m.getWidth())),
                          numpy_mat.shape)
 
@@ -100,11 +100,12 @@ class TestMatrix(unittest.TestCase):
 
             for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
                 self.assertAlmostEqual(a, e)
-    
+
     def test_numpy(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
         m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
+                         numpy_mat.shape)
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
         for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
             self.assertAlmostEqual(a, e)
diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py
index a3ba4eaaa69b39b75e7ece3095b6f236c1248d41..a90d15c272a3a2b56e35c979e053deb2b54eebc1 100644
--- a/paddle/api/test/testTrain.py
+++ b/paddle/api/test/testTrain.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/test/testTrainer.py b/paddle/api/test/testTrainer.py
index edd5a2da5785c405b46c2559ee93837ac68d7c3a..a76cbf02d83ac5ad82a96deee43c4afd104266a2 100644
--- a/paddle/api/test/testTrainer.py
+++ b/paddle/api/test/testTrainer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 48aaa1d73da9e6c207ad5fa2be14a531267bd901..1ab095c1d3d0d2c84d2d2f95a03f172b901de209 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,24 +26,24 @@ class TestIVector(unittest.TestCase):
             self.assertEqual(m[i], 0)
             m[i] = i
             self.assertEqual(m[i], i)
-        
+
         m = swig_paddle.IVector.createZero(10)
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), [0]*10)
+        self.assertEqual(m.getData(), [0] * 10)
 
     def test_create(self):
         m = swig_paddle.IVector.create(range(10), False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], i)
-        
+
         m = swig_paddle.IVector.create(range(10))
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
         self.assertEqual(m.getData(), range(10))
 
     def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec)
+        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
         vec[4] = 832
         for i in xrange(len(iv)):
@@ -69,7 +69,7 @@ class TestIVector(unittest.TestCase):
             expect_vec = range(0, 10)
             expect_vec[4] = 7
             self.assertEqual(vec.getData(), expect_vec)
-    
+
     def test_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
         iv = swig_paddle.IVector.createVectorFromNumpy(vec)
@@ -85,10 +85,10 @@ class TestVector(unittest.TestCase):
             self.assertTrue(util.doubleEqual(v[i], 0))
             v[i] = i
             self.assertTrue(util.doubleEqual(v[i], i))
-        
+
         v = swig_paddle.Vector.createZero(10)
         self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(v.getData(), [0]*10)
+        self.assertEqual(v.getData(), [0] * 10)
 
     def testCreate(self):
         v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
@@ -96,18 +96,17 @@ class TestVector(unittest.TestCase):
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], i / 100.0))
         self.assertEqual(100, len(v))
-        
+
         v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
         self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
         self.assertEqual(100, len(v))
         vdata = v.getData()
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
-        
 
     def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr)
+        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
         assert isinstance(vec, swig_paddle.Vector)
         numpy_arr[0] = 0.1
         for n, v in zip(numpy_arr, vec):
@@ -128,7 +127,7 @@ class TestVector(unittest.TestCase):
 
         for i in xrange(1, len(numpy_3)):
             util.doubleEqual(numpy_3[i], vec[i])
-    
+
     def testNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
         vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
@@ -136,7 +135,6 @@ class TestVector(unittest.TestCase):
         vecData = vec.getData()
         for n, v in zip(numpy_arr, vecData):
             self.assertTrue(util.doubleEqual(n, v))
-        
 
     def testCopyFromNumpy(self):
         vec = swig_paddle.Vector.createZero(1, False)
@@ -152,4 +150,4 @@ if __name__ == '__main__':
     unittest.TextTestRunner().run(suite)
     if swig_paddle.isGpuVersion():
         swig_paddle.setUseGpu(True)
-        unittest.main()
\ No newline at end of file
+        unittest.main()
diff --git a/paddle/api/test/util.py b/paddle/api/test/util.py
index 93a01b242f9f9a4c939cfbf9c4c7c47bb0e4e9cf..9f4631c53e11d55f9a2638f98c52ba2f5e955b37 100644
--- a/paddle/api/test/util.py
+++ b/paddle/api/test/util.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,9 @@ def doubleEqual(a, b):
 
 def __readFromFile():
     for i in xrange(10002):
-        yield np.random.rand(784), random.randint(0, 9)
+        label = np.random.randint(0, 9)
+        sample = np.random.rand(784) + 0.1 * label
+        yield sample, label
 
 
 def loadMNISTTrainData(batch_size=100):
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index 11dbfb54b268774405ade1e532bef9a0e8c7ada9..aa1ff4a771c4a1c64be86893e7b2261ae65f0f94 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,19 +15,24 @@ else()
 endif()
 
 set(CUDA_CXX_WITH_GPU_SOURCES
+    src/hl_cudart_wrap.cc
     src/hl_cuda_cublas.cc
     src/hl_cuda_cudnn.cc
     src/hl_cuda_device.cc)
 
-set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES}
-                            PROPERTIES COMPILE_FLAGS "-D__NVCC__")
-
-set_source_files_properties(${AVX_SOURCES}
-                            PROPERTIES COMPILE_FLAGS "-mavx")
+if(WITH_GPU)
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc
+        ${CUDA_CXX_WITH_GPU_SOURCES})
 
-set(CUDA_DSO_SOURCES
-    src/hl_dso_loader.cc
-    src/hl_cudart_wrap.cc)
+    set_source_files_properties(${CUDA_CXX_SOURCES}
+                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+else()
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc)
+endif()
 
 set(CUDA_CU_SOURCES
     src/hl_perturbation_util.cu
@@ -44,6 +49,7 @@ set(CUDA_CU_SOURCES
 set(CUDA_HEADERS
     include/hl_time.h
     include/hl_dso_loader.h
+    include/hl_warpctc_wrap.h
     include/hl_sequence.h
     include/hl_cuda_cublas.h
     include/hl_batch_transpose.h
@@ -75,14 +81,14 @@ if(WITH_GPU)
     cuda_add_library(paddle_cuda
         ${CUDA_SOURCES}
         ${CUDA_CU_SOURCES}
-        ${CUDA_DSO_SOURCES}
-        ${CUDA_CXX_WITH_GPU_SOURCES})
+        ${CUDA_CXX_SOURCES})
 else()
-    add_library(paddle_cuda ${CUDA_SOURCES})
+    add_library(paddle_cuda
+                ${CUDA_SOURCES}
+                ${CUDA_CXX_SOURCES})
 endif()
 
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
-                       ${CUDA_DSO_SOURCES}
-                       ${CUDA_CXX_WITH_GPU_SOURCES})
+                       ${CUDA_CXX_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index 03e15b2223a50625c6999f6b081ae984e76b182b..cdb2dba06cb4123da4be2088e290c6a740e0375b 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index a6d9ff8483eee28b2c8a380f0aca097c7662a02e..d2189de689f75d737be96d958e5da071cfeca5cf 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index ed339e312a7639cf9b78f130a43d67a7446576bb..35f4eabb4c07c6cc9d2edded02e5b6290b1232f8 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index a076952467a5ce10dc1f58007dda2170aa694fbb..84c5f2d5c91feb7896643d2c5f60a279ebe944e7 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -223,9 +223,9 @@ typedef struct {
 
 #ifdef __NVCC__
 
-#include "paddle/utils/Logging.h"
-#include "hl_cuda.h"
 #include "cuda_runtime.h"
+#include "hl_cuda.h"
+#include "paddle/utils/Logging.h"
 
 extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index f3630e9762508fd39935e62e0007de04f9140fff..e2e958cd67aa72b0f654e758e1da4753412aab3f 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index cffaac634f0f64be5ddab961d549ae43775bb7b0..06ee3b3654b576ec57dc437582e37ed0cea328ee 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh
index d39cf67448b4f226a2a10223ec1dee64bcaa82c6..c0a37ced2a72a1ab410025e2aa45313c23f1349a 100644
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cpu_lstm.cuh b/paddle/cuda/include/hl_cpu_lstm.cuh
index 65a174d85ba2cd4dd0d703f14175aed865945894..0e412fcdf57fe99f596f8dd597d67e10624f459c 100644
--- a/paddle/cuda/include/hl_cpu_lstm.cuh
+++ b/paddle/cuda/include/hl_cpu_lstm.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
index 239a2419918f9a1397e7ec79ac89a44accad6f3a..f35bfbc5c8253d632f8089f5037421f527633aad 100644
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 357286e3188a6f3184bc56e75232bf2e1ec54e44..5383c1130bba30a0c553afe8d6fc34a6400f236b 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_CUDA_H_
 #define HL_CUDA_H_
 
-#include "hl_base.h"
 #include <string>
+#include "hl_base.h"
 
 /**
  * @brief   HPPL event.
@@ -332,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event);
  */
 extern void hl_device_synchronize();
 
+/**
+ * @brief   gpu profiler start
+ */
+extern void hl_profiler_start();
+
+/**
+ * @brief   gpu profiler stop
+ */
+extern void hl_profiler_end();
+
 #endif  // HL_CUDA_H_
diff --git a/paddle/cuda/include/hl_cuda.ph b/paddle/cuda/include/hl_cuda.ph
index 9e0537aaf168713113649c07e068fe1f57d0f85f..701916b27922f980d0aa70eec8358eb10f006df5 100644
--- a/paddle/cuda/include/hl_cuda.ph
+++ b/paddle/cuda/include/hl_cuda.ph
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index db8c03c2c01c67788622d37b5330e22c31e03f34..e206e42b2aba9c83296de60ca7ef4784fb869c61 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index 3a2f916210277145efa8f6d7663a2698ea546b0b..db18e4912b63ec18dcfff3ef3aaf0c7947e0af18 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_cuda_cudnn.ph b/paddle/cuda/include/hl_cuda_cudnn.ph
index c0e82abe1785a030abe2f441b00c74ce8658aaa9..61378937cee78f9f2fb5c7f176555d3ead87da6c 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.ph
+++ b/paddle/cuda/include/hl_cuda_cudnn.ph
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh
index 159c26f443cb17116da2d2d5282f883d875a85be..e0b5632f23edc5cb4769a6e60c8e461d6011a3e0 100755
--- a/paddle/cuda/include/hl_device_functions.cuh
+++ b/paddle/cuda/include/hl_device_functions.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index 1eb9f9ca888d3a93f04621e10346b5f9ff34cdca..20c13f21e61a92b0635b686f6f724ae2b44518cc 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,12 +16,8 @@ limitations under the License. */
 #define HL_DSO_LOADER_H_
 
 #include <dlfcn.h>
-#include <string>
 #include <memory>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
+#include <string>
 #include "hl_base.h"
 
 /**
@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle);
  */
 void GetCurandDsoHandle(void** dso_handle);
 
+/**
+ * @brief    load the DSO of warp-ctc
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetWarpCTCDsoHandle(void** dso_handle);
+
 #endif  // HL_DSO_LOADER_H_
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 91ce9a0678463597df88c548aeac322ee19d95de..0d7e80a855d83f1b03046c04eea1a47a7a59e550 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 3be0df3b93b69811fb9c36dae223cbd927b02559..ede2670882ee2b93f610a2261a4ecc1784bc2d0c 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,27 +15,28 @@ limitations under the License. */
 #ifndef HL_GPU_H_
 #define HL_GPU_H_
 
+#include "hl_aggregate.h"
 #include "hl_base.h"
+#include "hl_cnn.h"
 #include "hl_cuda.h"
 #include "hl_cuda_cublas.h"
 #include "hl_cuda_cudnn.h"
-#include "hl_matrix.h"
-#include "hl_aggregate.h"
-#include "hl_cnn.h"
-#include "hl_sparse.h"
 #include "hl_lstm.h"
+#include "hl_matrix.h"
 #include "hl_sequence.h"
+#include "hl_sparse.h"
+#include "hl_warpctc_wrap.h"
 
 #ifdef HPPL_STUB_FUNC
-#include "stub/hl_cuda_stub.h"
-#include "stub/hl_cuda_cublas_stub.h"
-#include "stub/hl_cuda_cudnn_stub.h"
-#include "stub/hl_matrix_stub.h"
 #include "stub/hl_aggregate_stub.h"
 #include "stub/hl_cnn_stub.h"
-#include "stub/hl_sparse_stub.h"
+#include "stub/hl_cuda_cublas_stub.h"
+#include "stub/hl_cuda_cudnn_stub.h"
+#include "stub/hl_cuda_stub.h"
 #include "stub/hl_lstm_stub.h"
+#include "stub/hl_matrix_stub.h"
 #include "stub/hl_sequence_stub.h"
+#include "stub/hl_sparse_stub.h"
 #endif
 
 #endif /* HL_GPU_H_ */
diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/cuda/include/hl_gpu_functions.cuh
index a2c5ebd18a4403a0eab5341b509ed65148f4aa88..8e64cbe360eb9f2ddfb0b30f2d73408b182b4b3c 100644
--- a/paddle/cuda/include/hl_gpu_functions.cuh
+++ b/paddle/cuda/include/hl_gpu_functions.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_gpu_gru.cuh b/paddle/cuda/include/hl_gpu_gru.cuh
index 3e0cfdbe4f771782ca669470d819b923f67e2a37..6668e135d2b8c793b510bf06797dbc5ffb747dc9 100644
--- a/paddle/cuda/include/hl_gpu_gru.cuh
+++ b/paddle/cuda/include/hl_gpu_gru.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh
index 07806e11c18a2b47d79237587a0e882d7bf2a1d2..5dceba2f5b82a2b33273cb027049894998a5dfcf 100644
--- a/paddle/cuda/include/hl_gpu_lstm.cuh
+++ b/paddle/cuda/include/hl_gpu_lstm.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
index 201c5c25f19cd87e9ffd26fbbbb32b77ba5127a9..9bbdf5fa72678ef7ded0c4343bbd97306b78fe9a 100644
--- a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_gru_ops.cuh b/paddle/cuda/include/hl_gru_ops.cuh
index 3c137d8d44bfe131fd91aa361541c0aec8c37318..45f66ad533ebd00171dfa86eaa66dc6612c72403 100644
--- a/paddle/cuda/include/hl_gru_ops.cuh
+++ b/paddle/cuda/include/hl_gru_ops.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 7e527a79025969320f1aca75d313fd9d0194efd1..857756e5cd4c77ce618b03eddc9ca94948bd123a 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_lstm_ops.cuh b/paddle/cuda/include/hl_lstm_ops.cuh
index a5ea018dbcf77c74b2036b3a6670fe9cc00ba5a3..2601060cc2e50b876d2731c2b05ce54ff47d9315 100644
--- a/paddle/cuda/include/hl_lstm_ops.cuh
+++ b/paddle/cuda/include/hl_lstm_ops.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 96648661e345d8fa5d50cb2aae3a56ee53921f90..abd5eb3a0cf338c689680dd0f7192be7b2530383 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix_apply.cuh b/paddle/cuda/include/hl_matrix_apply.cuh
index 927212c83d2e5a1d586bd132b06b75198fc619ee..b10d177b970e259e98972bbe465a1b8cdb563c0e 100644
--- a/paddle/cuda/include/hl_matrix_apply.cuh
+++ b/paddle/cuda/include/hl_matrix_apply.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh
index a3645ef51e6ef7d3c7e204e4f29f68442101ad3e..db35ee2037433163ebb3673edb350e3fab71fba9 100644
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix_base_sse.cuh b/paddle/cuda/include/hl_matrix_base_sse.cuh
index dd55b848849404c85b3b8d1a947ec0a63771c159..db6c9cca03a8974a15cd2e7fbaf73033e3a57f4b 100644
--- a/paddle/cuda/include/hl_matrix_base_sse.cuh
+++ b/paddle/cuda/include/hl_matrix_base_sse.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix_ops.cuh b/paddle/cuda/include/hl_matrix_ops.cuh
index 3e5e1bc7010ec53cfd23b93652aae2498ae23773..fc29201357ce564dcf20b12d998223f7a4cc22b4 100644
--- a/paddle/cuda/include/hl_matrix_ops.cuh
+++ b/paddle/cuda/include/hl_matrix_ops.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh
index 51e483d1fb2ff311ae68219d823411a3d707a30e..59213eee75f50d3c054ed8684a9a0e1053342a0a 100644
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_MATRIX_TYPE_CUH_
 #define HL_MATRIX_TYPE_CUH_
 
 #include "hl_base.h"
 
 #ifdef __CUDA_ARCH__
-// typedef void*  vecType;
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
@@ -37,4 +35,10 @@ typedef __m128d vecType;
 #endif
 #endif
 
-#endif /* HL_MATRIX_TYPE_CUH_ */
+#ifdef __CUDA_ARCH__
+#define INLINE   __device__ inline
+#else
+#define INLINE   inline
+#endif
+
+#endif  // HL_MATRIX_TYPE_CUH_
diff --git a/paddle/cuda/include/hl_perturbation_util.cuh b/paddle/cuda/include/hl_perturbation_util.cuh
index 90fc1cb06035a292ae2d146222868400f031343e..93b81bf0358694219c55bf076c454790286c2418 100644
--- a/paddle/cuda/include/hl_perturbation_util.cuh
+++ b/paddle/cuda/include/hl_perturbation_util.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_recurrent_apply.cuh b/paddle/cuda/include/hl_recurrent_apply.cuh
index 0ccbf01f1c58daaed6731b62e8d7255eb2829ba2..113446cf75658f603200a57f3d8cd8a9e47fe728 100644
--- a/paddle/cuda/include/hl_recurrent_apply.cuh
+++ b/paddle/cuda/include/hl_recurrent_apply.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157..9bcd25b0623e569052e08c0befc8e09f937fa4bd 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch,
                                   int batchCount,
                                   bool seq2batch);
 
+/**
+ * @brief   Memory copy from sequence to batch,
+ *          while padding all sequences to the same length.
+ *
+ * if seq2batch == true
+ *
+ *    copy from sequence to batch:
+ *        batch[i] = sequence[sequenceStartPositions[i]]
+ *
+ * if seq2batch == false
+ *
+ *    copy from batch to sequence:
+ *        sequence[sequenceStartPositions[i]] = batch[i]
+ *
+ * @param[in,out]   batch                   batch matrix.
+ * @param[in,out]   sequence                sequence matrix.
+ * @param[in]       sequenceStartPositions  index vector.
+ * @param[in]       sequenceWidth           width of sequence.
+ * @param[in]       maxSequenceLength       maximum length of sequences.
+ * @param[in]       numSequences            number of sequences.
+ * @param[in]       normByTimes             whether dividing sequence's length.
+ * @param[in]       seq2batch               copy direction.
+ *
+ */
+extern void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch);
+
 /**
  * @brief  dst = Op(src), src is sequence.
  *
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index c4e0be23e2031cbcb124b532216a23d8a344668d..67fe701c109db43eec844944efa76cd3cb79edee 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_sparse.ph b/paddle/cuda/include/hl_sparse.ph
index d3bc73c80d3741a47997337351c2a56cb42ea0bf..13bba17811630ff12e9fcd85c520e83f44a086df 100644
--- a/paddle/cuda/include/hl_sparse.ph
+++ b/paddle/cuda/include/hl_sparse.ph
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
index 45db2f313e0d6e5d991a9bcc3ccd71262818ad09..9e50580669d2d4523dda239e90b4ed18a9214e2f 100644
--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index b4ac83a66af13c2a843872faba2ebd972008a738..2170b97f4d286608fb4b69ecbd9150e19b74834f 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7945b98201b1812790fb0d53123e9ee007640485
--- /dev/null
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -0,0 +1,334 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_TENSOR_OPS_H_
+#define HL_TENSOR_OPS_H_
+
+#include <cmath>
+#include "hl_matrix_type.cuh"
+
+namespace hppl {
+namespace unary {
+
+template <class T>
+class add_scale {
+private:
+  const T p;
+
+public:
+  INLINE add_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a + p; }
+};
+
+template <class T>
+class sub_scale {
+private:
+  const T p;
+
+public:
+  INLINE sub_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a - p; }
+};
+
+template <class T>
+class mul_scale {
+private:
+  const T p;
+
+public:
+  INLINE mul_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a * p; }
+};
+
+template <class T>
+class div_scale {
+private:
+  const T p;
+
+public:
+  INLINE div_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a / p; }
+};
+
+template <class T>
+class neg {
+public:
+  INLINE T operator()(const T a) const { return -a; }
+};
+
+template <class T>
+class exp_op {
+public:
+  INLINE T operator()(const T a) const { return std::exp(a); }
+};
+
+template <class T>
+class log_op {
+public:
+  INLINE T operator()(const T a) const { return std::log(a); }
+};
+
+template <class T>
+class sqrt_op {
+public:
+  INLINE T operator()(const T a) const { return std::sqrt(a); }
+};
+
+template <class T>
+class square {
+public:
+  INLINE T operator()(const T a) const { return a * a; }
+};
+
+template <class T>
+class reciprocal {
+public:
+  INLINE T operator()(const T a) const { return T(1) / a; }
+};
+
+template <class T>
+class abs {
+public:
+  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
+};
+
+template <class T>
+class sign {
+public:
+  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
+};
+
+template <class T>
+class min {
+private:
+  const T p;
+
+public:
+  INLINE min(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a > p ? p : a; }
+};
+
+template <class T>
+class max {
+private:
+  const T p;
+
+public:
+  INLINE max(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a < p ? p : a; }
+};
+
+template <class T>
+class pow_op {
+private:
+  const T p;
+
+public:
+  INLINE pow_op(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return std::pow(a, p); }
+};
+
+template <class T>
+class constant {
+private:
+  const T p;
+
+public:
+  INLINE constant(const T s) : p(s) {}
+  INLINE T operator()(int i) const { return p; }
+  INLINE T operator()(int i, int j) const { return p; }
+};
+
+template <class T>
+class cmp_eq {
+private:
+  const T p;
+
+public:
+  INLINE cmp_eq(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a == p; }
+};
+
+template <class T>
+class cmp_ne {
+private:
+  const T p;
+
+public:
+  INLINE cmp_ne(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a != p; }
+};
+
+template <class T>
+class cmp_le {
+private:
+  const T p;
+
+public:
+  INLINE cmp_le(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a <= p; }
+};
+
+template <class T>
+class cmp_lt {
+private:
+  const T p;
+
+public:
+  INLINE cmp_lt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a < p; }
+};
+
+template <class T>
+class cmp_ge {
+private:
+  const T p;
+
+public:
+  INLINE cmp_ge(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a >= p; }
+};
+
+template <class T>
+class cmp_gt {
+private:
+  const T p;
+
+public:
+  INLINE cmp_gt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a > p; }
+};
+
+template <class T>
+class and_op {
+private:
+  const T p;
+
+public:
+  INLINE and_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a && p; }
+};
+
+template <class T>
+class or_op {
+private:
+  const T p;
+
+public:
+  INLINE or_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a || p; }
+};
+
+}  // namespace unary
+
+namespace binary {
+template <class T>
+class add {
+public:
+  INLINE T operator()(const T a, const T b) const { return a + b; }
+};
+
+template <class T>
+class add_scale {
+private:
+  const T p1;
+  const T p2;
+
+public:
+  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
+  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
+};
+
+template <class T>
+class sub {
+public:
+  INLINE T operator()(const T a, const T b) const { return a - b; }
+};
+
+template <class T>
+class mul {
+public:
+  INLINE T operator()(const T a, const T b) const { return a * b; }
+};
+
+template <class T>
+class div {
+public:
+  INLINE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <class T>
+class cmp_eq {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a == b; }
+};
+
+template <class T>
+class cmp_ne {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a != b; }
+};
+
+template <class T>
+class cmp_le {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a <= b; }
+};
+
+template <class T>
+class cmp_lt {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a < b; }
+};
+
+template <class T>
+class cmp_ge {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a >= b; }
+};
+
+template <class T>
+class cmp_gt {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a > b; }
+};
+
+template <class T>
+class and_op {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a && b; }
+};
+
+template <class T>
+class or_op {
+public:
+  INLINE bool operator()(const T a, const T b) const { return a || b; }
+};
+
+template <class T>
+class min {
+public:
+  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
+};
+
+template <class T>
+class max {
+public:
+  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
+};
+
+}  // namespace binary
+}  // namespace hppl
+
+#endif  // HL_TENSOR_OPS_H_
diff --git a/paddle/cuda/include/hl_thread.ph b/paddle/cuda/include/hl_thread.ph
index 0cfc45993676d996e41a62ec34207aeb03151aa9..a3830ff8d8af5dfd3b932677c7f1e6ffcee97d47 100644
--- a/paddle/cuda/include/hl_thread.ph
+++ b/paddle/cuda/include/hl_thread.ph
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index b0a88c66a12fcfec6ea96b877423f907dac8dfa1..f63f02582060156562061f73c429fc7bbd878d2c 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #ifndef HL_TIME_H_
 #define HL_TIME_H_
-
+#include <cstdint>
 /**
  * @brief   High resolution timer.
  *
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index e8cfebbf6a3bd27c10a71d7817238bc304681fa4..77949ed295a6eaf7cc535853e53bef066ffac37c 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h
new file mode 100644
index 0000000000000000000000000000000000000000..79bf6c3db7f876009d98a62b6523588f021886e8
--- /dev/null
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_WARPCTC_WRAP_H_
+#define HL_WARPCTC_WRAP_H_
+
+#include "hl_base.h"
+#include "warp-ctc/include/ctc.h"
+
+typedef ctcStatus_t hl_warpctc_status_t;
+typedef ctcOptions hl_warpctc_options_t;
+
+/**
+ * @brief Init ctc options.
+ *
+ * @param[in]   blank     blank label used in ctc loss function.
+ * @param[in]   useGpu    whether use gpu.
+ * @param[out]  options   handle to store cpu or gpu informations.
+ *
+ */
+extern void hl_warpctc_init(const size_t blank,
+                            bool useGpu,
+                            hl_warpctc_options_t* options);
+
+/**
+ * @brief Compute the connectionist temporal classification loss,
+ *        and optionally compute the gradient with respect to the inputs.
+ *
+ * if batchGrad == nullptr
+ *
+ *    only compute the ctc loss.
+ *
+ * if batchGrad != nullptr
+ *
+ *    compute both ctc loss and gradient.
+ *
+ * @param[in]   batchInput      batch matrix of input probabilities,
+ *                              in maxSequenceLength x numSequence x numClasses
+ *                              (row-major) format.
+ * @param[out]  batchGrad       batch matrix of gradient.
+ * @param[in]   cpuLabels       labels always in CPU memory.
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequence.
+ * @param[out]  cpuCosts        cost of each sequence in CPU memory.
+ * @param[out]  workspace       workspace to store some temporary results.
+ * @param[in]   options         handle to store cpu or gpu informations.
+ *
+ */
+extern void hl_warpctc_compute_loss(const real* batchInput,
+                                    real* batchGrad,
+                                    const int* cpuLabels,
+                                    const int* cpuLabelLengths,
+                                    const int* cpuInputLengths,
+                                    const size_t numClasses,
+                                    const size_t numSequences,
+                                    real* cpuCosts,
+                                    void* workspace,
+                                    hl_warpctc_options_t* options);
+
+/**
+ * @brief Compute the required workspace size.
+ *        There is no memory allocated operations within warp-ctc.
+ *
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequence.
+ * @param[in]   options         handle to store cpu or gpu informations.
+ * @param[out]  bytes           pointer to a scalar where the memory
+ *                              requirement in bytes will be placed.
+ *
+ */
+extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                          const int* cpuInputLengths,
+                                          const size_t numClasses,
+                                          const size_t numSequences,
+                                          hl_warpctc_options_t* options,
+                                          size_t* bytes);
+
+#endif  // HL_WARPCTC_WRAP_H_
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index bb53fc581e09905aa7a9b2d8dfe44b04c677c40a..bbfa9b8fad5b49290c05eeeeeeea4fe4b85a683f 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 2f73b9671edd3609996aebff2913f5262805f869..52c978735279ed804c44f0e93472355637e8b98d 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 85f7c390c47397127487b16fdc933f0afe2fb880..e86fd853f407de8bbcec5a2629d6ae600f696eb3 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index 3beb0e5b5170261a6c453936b8b0347f3e97dbff..abd0d6b09901a7cd124c245e359f9d38f52bda26 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index 1f91068cdf8b3d472c4b403d1ec7d5293c28c07e..5246a8d5a48fa7a6ac075f12a312d9c5383cdf8c 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -90,4 +90,8 @@ inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
 
 inline void hl_device_synchronize() {}
 
+inline void hl_profiler_start() {}
+
+inline void hl_profiler_end() {}
+
 #endif  // HL_CUDA_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 7ccda032d26f2fbbe99136e8481416daea557a78..246ba79f6324dd48a423d3f748bf8f551ec3d376 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 1bd78d23fbaf46e6265ba0db25ea399a204bd96f..0b669f6735cb9771fd63ed8e3b45602db0db447c 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 381f0a6f26c5669465f029e972c6ca8b0e6e1776..d6b07556f8958a62bd47f0b47b75bbebafeb58d3 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch,
                                   int batchCount,
                                   bool seq2batch) {}
 
+inline void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch) {}
+
 inline void hl_sequence_avg_forward(real* dst,
                                     real* src,
                                     const int* starts,
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index d47bdd2c47d097c4c68b7b7e88ef888bc18270c2..bd17461d88f716a0ed96e90ae36bb0f65099e110 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
index c1e0c7f9d9e7958a6b4ba3617ca488e49af20655..906647587642f3815915efbeca7aab6458d25a70 100644
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu
index 00fd18e7f3936c2b2374d747745f296c7b9cf728..f047403da17e66960f029f2fee7312210009c952 100644
--- a/paddle/cuda/src/hl_batch_transpose.cu
+++ b/paddle/cuda/src/hl_batch_transpose.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index af00f352e536bf342e15315d1f6804225b87eb0b..c2117a7315fed506d0f746ddfceaf304942cf5a2 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu
index 4eb775eb7971e467ef8b3a059af6f0d35b77e8ff..97034a917708487d1c5dc59e6ebbf45bad1c3227 100644
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index ae387a8bc0e0791995810df9e5f2556264d869b1..0992286f360fb8be22e3c35b632e4b7163036277 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <float.h>
 #include "hl_base.h"
 #include "hl_cnn.h"
+#include "hl_device_functions.cuh"
 
 __global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
                               size_t blockH, size_t blockW, size_t width,
@@ -641,10 +642,10 @@ __global__ void KeBilinearInterpBw(real* in,
     real* inPos =
       &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx];
     const real* outPos = &out[outIdH * outputW + outIdW];
-    atomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
-    atomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
-    atomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]);
-    atomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]);
   }
 }
 
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index e8ba232d44b3f66254d4749d4abbcfbe46d1fd0e..182e8ab218cce18448f8a08f5c1a1dab7e38f2b6 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_cuda_cublas.h"
 #include <sys/time.h>
 #include <mutex>
 #include "hl_cuda.h"
-#include "hl_cuda_cublas.h"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 9d4ff08a78d641896e946e9bf04590d4ba93350f..8cddf10d40c6277c6bb29a4fe11e5845a2770213 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_cuda_cudnn.h"
 #include <cudnn.h>
 #include <mutex>
-#include "hl_cuda_cudnn.h"
 #include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
+#include "hl_thread.ph"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-               4096,
-               "Specify cuDNN max workspace limit, in units MB, "
-               "4096MB=4GB by default.");
+DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+             4096,
+             "Specify cuDNN max workspace limit, in units MB, "
+             "4096MB=4GB by default.");
 
 namespace dynload {
 
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 745be35b56278ed2e0033d5fd2806320d3164d7c..a71eecba2736234dafaf6b67e5efac5358a30871 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,16 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/time.h>
+// clang-format off
+// Because clang-format 4.X and clang-format 3.8+ format
+// following lines in different. So disable clang-format.
+#include "hl_cuda.h"
+#include <cuda_profiler_api.h>
 #include <string.h>
-#include <unistd.h>
 #include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
 #include <mutex>
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
+// clang-format on
 
 namespace dynload {
 
@@ -133,7 +139,9 @@ void *cudart_dso_handle = nullptr;
   __macro(cudaGetLastError)               \
   __macro(cudaFuncSetCacheConfig)         \
   __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)
+  __macro(cudaGetErrorString)             \
+  __macro(cudaProfilerStart)              \
+  __macro(cudaProfilerStop)
 // clang-format on
 
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
@@ -742,3 +750,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
   }
   return true;
 }
+
+void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+
+void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index cf009620bf69d05397c5e03de3f7f2856bf4ff6b..b869d903ba3cfb188f823518ba8ee7d17f9b2440 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 0b7cd3375671d58464dac93458ec6659add8b730..2b4c6f7c39cff78c0e76cc1dfd41e1c7ef334f11 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 63824eaa4c201c50ea20521801cd12de685aa3b9..4e33ac443c1f78b7fa50a15784875cbadfcf7497 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch,
   CHECK_SYNC("hl_sequence2batch_add failed");
 }
 
+template<bool normByTimes, bool seq2batch>
+__global__
+void KeSequence2BatchPadding(real* batch,
+                             real* sequence,
+                             const int* sequenceStartPositions,
+                             const size_t sequenceWidth,
+                             const size_t maxSequenceLength,
+                             const size_t numSequences) {
+  int batchIdx = blockIdx.y;
+  int sequenceStart = sequenceStartPositions[batchIdx];
+  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
+
+  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
+  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
+  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
+
+  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
+
+  if (sequenceIdx < sequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
+      }
+    } else {
+      /* batch -> sequence */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
+      }
+    }
+  } else if (sequenceIdx < maxSequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = 0;
+      }
+    }
+  }
+}
+
+void hl_sequence2batch_copy_padding(real* batch,
+                                    real* sequence,
+                                    const int* sequenceStartPositions,
+                                    const size_t sequenceWidth,
+                                    const size_t maxSequenceLength,
+                                    const size_t numSequences,
+                                    bool normByTimes,
+                                    bool seq2batch) {
+  CHECK_NOTNULL(batch);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(sequenceStartPositions);
+
+  if (!normByTimes && numSequences == 1) {
+    size_t elementCount = maxSequenceLength * sequenceWidth;
+    if (seq2batch) {
+      /* sequence -> batch */
+      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
+    } else {
+      /* batch -> sequence */
+      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
+    }
+    return;
+  }
+
+  const int CUDA_BLOCK_SIZE = 512;
+
+  /* At least use 32 threads to copy sequenceWidth elements,
+     and at least 8 elements for each thread. */
+  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
+  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
+
+  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
+  dim3 threads(blockDimX, blockDimY);
+
+  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
+      CUDA_BLOCK_SIZE;
+  int gridDimY = numSequences;
+  dim3 grid(gridDimX, gridDimY);
+
+  if (seq2batch) {
+    /* sequence -> batch */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    }
+  } else {
+    /* batch -> sequence */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    }
+  }
+
+  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
+}
+
 __device__ inline float my_rsqrt(float x) {
   return rsqrtf(x);
 }
diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu
index 1687fcc221ab85aff943ccf8c5be7c1ed918f853..ab9ab57c884137f117c25c2752b5603b2e8b7135 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh
index 9cf2d5a843343075c33d19bf34d9ed315299de83..72572756a671b0d43f9b5e68bab2a5e5aef5f762 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index ff6b830b7addc5c87af0d55070260c279a046a75..ecc03a729dde2f2b4f8f004234a47d9272997a50 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_DSO
 
+#include <cuda_runtime.h>
 #include <mutex>
 #include "hl_dso_loader.h"
 
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index 1a3ce08619fc3a5787576b30e9f4c13336990e74..54c7620fc081f681d9d33bcd711008fa5029df05 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,19 +16,21 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_string(cudnn_dir,
-                "",
-                "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib. If empty [default], dlopen "
-                "will search cudnn from LD_LIBRARY_PATH");
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
 
-P_DEFINE_string(cuda_dir,
-                "",
-                "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-                "libcudart can not be specified by cuda_dir, since some "
-                "build-in function in cudart already ran before main entry). "
-                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+DEFINE_string(cuda_dir,
+              "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+              "libcudart can not be specified by cuda_dir, since some "
+              "build-in function in cudart already ran before main entry). "
+              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
@@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
     *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
     // if not found, search from default path
     if (nullptr == *dso_handle) {
-      LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
+                   << dlerror() << ")";
       dlPath = dso_name;
       GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
   }
 
-  CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
-                                << std::endl
+  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
+                                << " (" << dlerror() << ") \n"
                                 << "Please specify its path correctly using "
-                                   "one of the following ways: \n"  // NOLINT
+                                   "one of the following ways: \n"
 
                                 << "Method 1. set cuda and cudnn lib path at "
                                    "runtime. "
                                 << "http://www.paddlepaddle.org/doc/ui/"
                                    "cmd_argument/"
-                                   "argument_outline.html \n"  // NOLINT
+                                   "argument_outline.html \n"
                                 << "For instance, issue command: paddle train "
                                    "--use_gpu=1 "
                                 << "--cuda_dir=/usr/local/cuda/lib64 "
                                    "--cudnn_dir=/usr/local/cudnn/lib "
-                                   "...\n"  // NOLINT
+                                   "...\n"
 
                                 << "Method 2. set environment variable "
                                    "LD_LIBRARY_PATH on Linux or "
@@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                    "DYLD_LIBRARY_PATH is impossible "
                                 << "unless System Integrity Protection (SIP) "
                                    "is disabled. However, "
-                                   "method 1 "  // NOLINT
+                                   "method 1 "
                                 << "always work well.";
 }
 
@@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
 #endif
 }
+
+void GetWarpCTCDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+#endif
+}
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index f4bf888bab4e92dd940714ef1b7aeee9242eb817..3048693fb870d7032fb690570d7e31e17001507a 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu
index a10d06f8a97cd8c807d173d8a8c29ac698b26bd5..2a945bcdb87fe49c121890128ef77b084ebe8e60 100644
--- a/paddle/cuda/src/hl_perturbation_util.cu
+++ b/paddle/cuda/src/hl_perturbation_util.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu
index 52ee4610edf670dc339e0ece66d58153c0164499..61edbe3ccc7028fd8779c4119f33c4cb5afe0564 100644
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index d52b2a1df07374f632def12eb52e10e10ca86028..7e5d7e8aaecbcdc61c1e5b5006a2958d4dc84460 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <chrono>
+#include "hl_time.h"
 #include <stdlib.h>
+#include <chrono>
+#include <cstdint>
 #include <iostream>
-#include "hl_time.h"
 
 using std::chrono::high_resolution_clock;
 
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
index ed74787b610ca13fb527348fb6ebcc68af0fefe8..f0ef0cc3c51f9e7935dc3c40f630e4d70960802a 100644
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ae8bc0f220e143a5c59d8c3ead012a20369e7b9
--- /dev/null
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_warpctc_wrap.h"
+#include <mutex>
+#include "hl_dso_loader.h"
+#include "paddle/utils/Logging.h"
+
+namespace dynload {
+
+std::once_flag warpctc_dso_flag;
+void* warpctc_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warpctc routine
+ * via operator overloading. When PADDLE_USE_DSO is
+ * false, you need to add the path of libwarp-ctc.so to
+ * the linked-libs of paddle or to LD_PRELOAD.
+ */
+#ifdef PADDLE_USE_DSO
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
+      std::call_once(                                                  \
+          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
+      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
+      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
+    }                                                                  \
+  } __name;  // struct DynLoad__##__name
+#else
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                            \
+  } __name;  // struct DynLoad__##__name
+#endif
+
+// include all needed warp-ctc functions
+DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
+DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
+DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
+DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
+
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+
+} /* namespace dynload */
+
+#define WARPCTC_GET_VERSION dynload::get_warpctc_version
+#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
+
+#ifndef PADDLE_TYPE_DOUBLE
+#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
+#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
+#else
+#define WARPCTC_LOG_FATAL                                \
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
+             << "] Error: not support double precision."
+#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+#endif
+
+/**
+ * Check build-in warp-ctc function using glog and it also
+ * support << operator for more details error info.
+ */
+static int g_warpctcVersion = -1;
+#define CHECK_WARPCTC(warpctcStat)                \
+  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
+      << "warp-ctc [version " << g_warpctcVersion \
+      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
+
+void hl_warpctc_init(const size_t blank,
+                     bool useGpu,
+                     hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(options);
+
+  g_warpctcVersion = WARPCTC_GET_VERSION();
+
+  if (useGpu) {
+#ifdef __NVCC__
+    options->loc = CTC_GPU;
+    options->stream = STREAM_DEFAULT;
+#else
+    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
+#endif
+  } else {
+    options->loc = CTC_CPU;
+    options->num_threads = 1;
+  }
+
+  options->blank_label = blank;
+}
+
+void hl_warpctc_compute_loss(const real* batchInput,
+                             real* batchGrad,
+                             const int* cpuLabels,
+                             const int* cpuLabelLengths,
+                             const int* cpuInputLengths,
+                             const size_t numClasses,
+                             const size_t numSequences,
+                             real* cpuCosts,
+                             void* workspace,
+                             hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(batchInput);
+  CHECK_NOTNULL(cpuLabels);
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(cpuCosts);
+  CHECK_NOTNULL(workspace);
+  CHECK_NOTNULL(options);
+
+  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
+                                     batchGrad,
+                                     cpuLabels,
+                                     cpuLabelLengths,
+                                     cpuInputLengths,
+                                     numClasses,
+                                     numSequences,
+                                     cpuCosts,
+                                     workspace,
+                                     *options));
+}
+
+void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                   const int* cpuInputLengths,
+                                   const size_t numClasses,
+                                   const size_t numSequences,
+                                   hl_warpctc_options_t* options,
+                                   size_t* bytes) {
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(options);
+  CHECK_NOTNULL(bytes);
+
+  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
+                                           cpuInputLengths,
+                                           numClasses,
+                                           numSequences,
+                                           *options,
+                                           bytes));
+}
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f1bb94216c44b3e915f87a3ae49bdfd3ef812916..f8c4bcac2f8eb41400659dc24ba81768e7ae3640 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "ActivationFunction.h"
 
 #include <algorithm>
-#include <memory>
 #include <iostream>
-#include <type_traits>
+#include <memory>
 #include <string>
 #include <thread>
-#include "paddle/utils/ClassRegistrar.h"
+#include <type_traits>
 #include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 #include "paddle/utils/Logging.h"
 
@@ -289,7 +289,7 @@ void forward(Argument& act) {
                          useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
-  act.value->abs(*act.value);
+  act.value->abs2(*act.value);
 }
 
 void backward(Argument& act) { act.grad->absDerivative(*act.in); }
@@ -311,7 +311,7 @@ void forward(Argument& act) {
                          useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
-  act.value->square(*act.value);
+  act.value->square2(*act.value);
 }
 
 void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
@@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp(*act.value); }
+void forward(Argument& act) { act.value->exp2(*act.value); }
 
 void backward(Argument& act) { act.grad->expDerivative(*act.value); }
 END_DEFINE_ACTIVATION(exponential)
@@ -345,7 +345,7 @@ void forward(Argument& act) {
                          useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
-  act.value->log(*act.value);
+  act.value->log2(*act.value);
 }
 
 void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf..601e3b6c0cd401ec007e8cf51e44416f82832e58 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index e6cc4a246a8494d287f8638674f4ae213f38f657..0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #include "DataProvider.h"
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Logging.h"
-#include <algorithm>
 #include <unistd.h>
+#include <algorithm>
 #include "ProtoDataProvider.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 8b7fb27f821a47d830413eced79b3352a6969c90..9b7f7e36cedaa230ae0694d87cc033bd6fa6e652 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,28 +14,28 @@ limitations under the License. */
 
 #pragma once
 
-#include <vector>
-#include <memory>
-#include <mutex>
-#include <iostream>
-#include <fstream>
 #include <stdint.h>
-#include <string.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <vector>
 
+#include "DataConfig.pb.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Util.h"
-#include "paddle/math/Vector.h"
-#include "DataConfig.pb.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Argument.h"
 
 namespace paddle {
 /**
@@ -271,7 +271,9 @@ public:
   void finishAsyncLoad() {
     stopping_ = true;
     taskReadySem_.post();
-    asyncLoader_->join();
+    if (asyncLoader_) {
+      asyncLoader_->join();
+    }
   }
 
   void setPending(bool pending) { pending_ = pending; }
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 6c178e29ee714a6bd7f58861d7cf15716fee848d..69ac2590b9cc8de95228fe2a83480e2c1e03db0a 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 51fb1f26668c55dc1c2aecd5389f327e2569a52f..46fe053768e480c5f69f597c49f363cb966a4168 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "MultiDataProvider.h"
-#include "paddle/utils/Logging.h"
 #include <algorithm>
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index 876467c04f074cf37e48fdfa9b24f236fcfe8ba1..4c8fb2cd0ddf5d3b6304445508195b5c95dd6ab2 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 0a7ff802461f2ded0e6e842c088bddf218361f79..c6f5cab1915b7f41d505c37a7fef762a392bad7f 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include <algorithm>
 #include <fstream>
 #include <istream>
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "DataProviderGroup.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_double(memory_threshold_on_load_data,
-                1.0,
-                "stop loading data when memory is not sufficient");
+DEFINE_double(memory_threshold_on_load_data,
+              1.0,
+              "stop loading data when memory is not sufficient");
 
 namespace paddle {
 
@@ -562,16 +562,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data(),
+              HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
         }
@@ -598,16 +598,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data(),
+              HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
         }
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f..7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/utils/Stat.h"
 #include "DataFormat.pb.h"
+#include "paddle/utils/Stat.h"
 
 #include "DataProvider.h"
 #include "ProtoReader.h"
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f..4e6f58a5292bec276994fde0764278d12d7ae9d5 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include <memory>
 
-#include <google/protobuf/message_lite.h>
 #include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/message_lite.h>
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce..5bdd55309c8bf8d5dcf84f5dcef2c5c85249a668 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -316,16 +316,16 @@ void PyDataProvider::handleSparseNonValueSlot(
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data(),
-                   HPPL_STREAM_1);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data(),
+        HPPL_STREAM_1);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
   }
@@ -347,16 +347,16 @@ void PyDataProvider::handleSparseValueSlot(
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data(),
-                   HPPL_STREAM_DEFAULT);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data(),
+        HPPL_STREAM_DEFAULT);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
   }
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 6bb7c831fdd451abc5241199d6a4d1b1ad814517..1401c13a1e611fda3000ec4af826ba0e66a3998a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 967fc9026a39967477d606862e060b680512901a..460efc5adc6f017e91dc9daff6ab32312e4460c1 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,18 +15,18 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
 
 #include <Python.h>
+#include <numpy/numpyconfig.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unordered_set>
 #include <list>
-#include <numpy/numpyconfig.h>
+#include <unordered_set>
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #include <numpy/ndarrayobject.h>
 
 #include "DataProvider.h"
 
-#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
@@ -400,10 +400,9 @@ private:
 
       if (this->loadThread_) {  // wait poolActualSize < poolSize;
         std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l,
-                     [this, additionalBatchSize] {
-                       return this->poolActualSize_ < poolSize_;
-                     });
+        pushCV_.wait(l, [this, additionalBatchSize] {
+          return this->poolActualSize_ < poolSize_;
+        });
       }
 
       {
@@ -529,12 +528,10 @@ public:
                         // but, loading from cache, cache object should ensure
                         // data pool ready.
       std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l,
-                   [this, &size] {
-                     return this->poolActualSize_ >=
-                                std::max(size, this->minPoolSize_) ||
-                            callingContexts_.empty();
-                   });
+      pullCV_.wait(l, [this, &size] {
+        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
+               callingContexts_.empty();
+      });
 
       if (unittest::OnPoolFilled) {
         (*unittest::OnPoolFilled)(this->poolActualSize_);
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 8f7d2fb80e9b6f2b4c83d90a04dab5219435d344..05aa6c012ae2bc0afcbaf23f8ff78b3c782d050c 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 923e77fc9df919794902daed6113792e7f89a552..3d8af5bcd419e76fb2026eddc95dc409a33c9d92 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index f5df2b18dedde9022d04b034912e59be00f15413..2f9928191170aa6cf25417362cb360b5e2865b69 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
+#include "paddle/utils/Stat.h"
 
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
@@ -842,9 +842,9 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
   auto start = predictArray.begin();
   while (start != predictArray.end()) {
     auto end = std::find_if(
-        start + 1,
-        predictArray.end(),
-        [=](const PredictionResult& x) { return x.queryid != start->queryid; });
+        start + 1, predictArray.end(), [=](const PredictionResult& x) {
+          return x.queryid != start->queryid;
+        });
     CHECK(end != start);
     stat(start - predictArray.begin(),
          end - predictArray.begin(),
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 732abb6079523b1cce8d0727c94ef65581842b4c..5770847309670ef1856cfb9255fa847c24513b56 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/ClassRegistrar.h"
+#include <fstream>
 #include "ModelConfig.pb.h"
 #include "paddle/parameter/Argument.h"
-#include <fstream>
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 3761fda5f370e3b1aef0e394286c49d8ec831694..36ca05b919b136c162105cf4f1fb7705ae7ca7f3 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "GradientMachine.h"
 
-#include "paddle/utils/Logging.h"
 #include <fstream>
+#include "paddle/utils/Logging.h"
 
-#include "hl_gpu.h"
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
+#include "GradientMachineMode.h"
 #include "MultiGradientMachine.h"
-#include "NeuralNetwork.h"
 #include "MultiNetwork.h"
-#include "GradientMachineMode.h"
+#include "NeuralNetwork.h"
+#include "NeuralNetwork.h"
+#include "ParallelNeuralNetwork.h"
+#include "hl_gpu.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 27cdf7f7890673673d5be63fecdd61d5d2a11447..579eca71d4cdd2545a3a8be1c7f1dacfdd5ef66b 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,15 +17,15 @@ limitations under the License. */
 #include <iostream>
 #include <vector>
 
-#include "paddle/math/Matrix.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-#include "paddle/utils/Thread.h"
-#include "TrainerConfig.pb.h"
 #include "ModelConfig.pb.h"
+#include "TrainerConfig.pb.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/ParameterUpdaterBase.h"
+#include "paddle/utils/Thread.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/gserver/gradientmachines/GradientMachineMode.cpp
index 4a90a4a566ac9b9a5eb627a1b895151fb9e9cfc0..3583fb4de88acd1c68b1a1f56ea393bb630ba356 100644
--- a/paddle/gserver/gradientmachines/GradientMachineMode.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachineMode.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h
index f2f55a70671858145572e4a5c0f1c4b609145f98..7bc885fe99ed3831aa111561dd1cd20d499d5e53 100644
--- a/paddle/gserver/gradientmachines/GradientMachineMode.h
+++ b/paddle/gserver/gradientmachines/GradientMachineMode.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 148451f18dceb0c470dadab01ff91915f994c68f..88c098b3559d8d2918309aa48329af067f79bdd5 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,11 +21,11 @@ limitations under the License. */
 #include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
 
-P_DEFINE_bool(allow_only_one_model_on_one_gpu,
-              true,
-              "If true, do not allow multiple models on one GPU device");
+DEFINE_bool(allow_only_one_model_on_one_gpu,
+            true,
+            "If true, do not allow multiple models on one GPU device");
 #ifdef PADDLE_METRIC_LEARNING
-P_DECLARE_bool(external);
+DECLARE_bool(external);
 #endif
 
 namespace paddle {
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 58c5486810cf280c48c62f2256480c1a4bb047bc..5f9855c4be869aa73aaebfc2e75ee51f050f2722 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include "GradientMachine.h"
 
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "hl_gpu.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Queue.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index e5be19cad6b450850de4cc5776017b79d3243681..6eb3d8db962161ed4123b4ef4a4bb42147bfdf19 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include <algorithm>
 
 #include "MultiNetwork.h"
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h
index 779a2267f55c8e1b5d120d9fd1e2a0d455cc5c59..89fbf32b4f90bceab60b8335c27b369806faaee1 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.h
+++ b/paddle/gserver/gradientmachines/MultiNetwork.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 9932ea655ebdceb2eb1ae8920f4d320163d14262..ee36a87b9d848edcc37f89221141de3f939e1110 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Logging.h"
 
-#include "paddle/utils/Stat.h"
-#include "hl_gpu.h"
+#include "MultiNetwork.h"
 #include "NeuralNetwork.h"
 #include "RecurrentGradientMachine.h"
-#include "MultiNetwork.h"
+#include "hl_gpu.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 void parameterInitNN(int paramId,
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 55ef45c5eeddc770ec3bc8fd0055d561eaf3b754..384ca88f47ffb20ca7d16a276a190b063158d273 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
-#include <map>
 #include <functional>
+#include <map>
+#include <memory>
 
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Parameter.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/gserver/layers/CostLayer.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 /*
@@ -57,14 +57,13 @@ void parameterInitNN(int paramId,
 
 class NeuralNetwork : public GradientMachine {
 public:
-  virtual void init(
-      const ModelConfig& config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>&
-          parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                      PARAMETER_GRADIENT,
-                                                      PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType>& parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
 
   /**
    * Connect two submodels and
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
index 9dbf418c31b0969eef7477a22b6f1bf63dab9b03..980a5851a2734ce42b3417d16a37987dc5ed6b24 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index 71488bc3b7a52d851d0e3fb77c48f3fd36bdce83..8f445b1ded3eb8960dc06512dd3f80b00d284acc 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -37,14 +37,13 @@ public:
                         NeuralNetwork *rootNetwork = nullptr)
       : NeuralNetwork(subModelName, rootNetwork) {}
 
-  virtual void init(
-      const ModelConfig &config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>
-          &parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                       PARAMETER_GRADIENT,
-                                                       PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig &config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType> &parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
 
   virtual void forward(const std::vector<Argument> &inArgs,
                        std::vector<Argument> *outArgs,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 516b61757698923eb0fde1f3b1d28074cac10044..8f68b3d66bd263b8df34801878efee3e2de2622d 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
+#include "RecurrentGradientMachine.h"
+#include <dlfcn.h>
 #include <algorithm>
+#include <cmath>
 #include <functional>
-#include <dlfcn.h>
 #include <limits>
-#include <cmath>
-#include "RecurrentGradientMachine.h"
 #include "NeuralNetwork.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
+DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
 
 static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
 static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
@@ -78,20 +78,22 @@ static inline SymbolType loadDiySymbol(const char* symbolName) {
   return reinterpret_cast<SymbolType>(sym);
 }
 
-static InitFunction __init__diy_prob_method([] {
-  std::string soName = FLAGS_diy_beam_search_prob_so;
-  if (!soName.empty()) {
-    gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-    CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-    atexit(exit_diy_prob);
-    gDiyProbMethod =
-        loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStart =
-        loadDiySymbol<decltype(gDiyProbStart)>(DIY_START_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStop =
-        loadDiySymbol<decltype(gDiyProbStop)>(DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-  }
-}, std::numeric_limits<int>::max());
+static InitFunction __init__diy_prob_method(
+    [] {
+      std::string soName = FLAGS_diy_beam_search_prob_so;
+      if (!soName.empty()) {
+        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
+        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
+        atexit(exit_diy_prob);
+        gDiyProbMethod =
+            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
+            DIY_START_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
+            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
+      }
+    },
+    std::numeric_limits<int>::max());
 
 class BeamSearchControlCallbacks {
 public:
@@ -1281,10 +1283,9 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) {
       std::vector<std::vector<int>*> prefixes;
       prefixes.resize(paths.size());
       std::transform(
-          paths.begin(),
-          paths.end(),
-          prefixes.begin(),
-          [](const Path& p) { return const_cast<std::vector<int>*>(&p.ids); });
+          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
+            return const_cast<std::vector<int>*>(&p.ids);
+          });
       beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
           prefixes, frames_[machineCur].get(), i);
     }
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index cb74a67e52f5f48d106b9fe93b1230a1675d3341..db7d8aff6d3150dd272a924c20e16bfe28d11442 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
 #include "GradientMachine.h"
 #include "NeuralNetwork.h"
-#include <functional>
 
 #include "paddle/utils/Locks.h"
 
diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/gserver/layers/AddtoLayer.cpp
index 8a9aecfa19b815814a985183ee28344a6f4f9712..5338530113e8428ceff58a340394a3bbb21f644e 100644
--- a/paddle/gserver/layers/AddtoLayer.cpp
+++ b/paddle/gserver/layers/AddtoLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 883d186f3e63f3a60789c0a4f0e05db1202f3ec8..53d3f99cdd3439a1ba85f54526ca65005986c634 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index eb89281cb1c75cb9b0679bd40ed4cfd4e2224188..2d300290279d6aafc162f11dbc809537a308ca79 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index 0186653c0f26cd2b53fc6d96d0dfad09dab6fa5b..41683ad6712d5df710737cf71c600790fcc8786f 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
index af64e15fe3ba68c62f164c45400f55fcaa937068..b8955ab04f209629c855ed66f8e8e9701b7224a3 100644
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index 1edc2ace492c5b96da3255c7e93e257830789985..b3c4ecec8bc6f56b4563ee9f1ada91e4d8f2cbb5 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 2d5bcff29fd5ad33c8eba85fc803bbf89803782e..1ceaaaa206ee3cbc5421238574c7f310011ccaa5 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
 #include "BatchNormalizationLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 #ifndef PADDLE_ONLY_CPU
 #include "CudnnBatchNormLayer.h"
 #endif
@@ -60,18 +60,16 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
 
 void BatchNormBaseLayer::calFeatureMapSize() {
   const ImageConfig& conf = config_.inputs(0).image_conf();
-  if (inputLayers_[0]->getOutput().getFrameHeight() == 0 &&
-      inputLayers_[0]->getOutput().getFrameWidth() == 0) {
-    imgSize_ = conf.img_size();
-    imageH_ = imgSize_;
-    imageW_ = imgSize_;
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imageH_ == 0 && imageW_ == 0) {
+    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+    imageW_ = conf.img_size();
   } else {
-    imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-    imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+    getOutput().setFrameHeight(imageH_);
+    getOutput().setFrameWidth(imageW_);
   }
   imgPixels_ = imageH_ * imageW_;
-  getOutput().setFrameHeight(imageH_);
-  getOutput().setFrameWidth(imageW_);
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index d65882d39df2bb93920dad37ebc78342e31aef85..75bda95de1472b08538b48072ddf9ea607b83299 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Stat.h"
 #include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
@@ -77,9 +77,8 @@ protected:
   MatrixPtr savedMean_;
   MatrixPtr savedInvVar_;
 
-  /// Height or width of input image feature, now height is equal to width.
-  /// imgSize is 1 if the input is fully-connected layer.
-  int imgSize_;
+  /// Height or width of input image feature.
+  /// Both of them are 1 if the input is fully-connected layer.
   int imageH_;
   int imageW_;
   /// Height * Width.
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index e431c033117c5d405324e7440b84d0e79018b52a..e6a0624636380e0e8ed5e6ee5066fbcf0439f507 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
   savedMean_->mulScalar(1.0 / numSamples);  // E[x]
 
   tmpMat_->assign(*mat);
-  tmpMat_->square();
+  tmpMat_->square2();
   savedInvVar_->zeroMem();
   savedInvVar_->accumulateColSum(*tmpMat_);
   savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
@@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
   calMovingMeanAndVar();
 
   savedInvVar_->subScalar(-EPS);
-  savedInvVar_->sqrt(*savedInvVar_);
+  savedInvVar_->sqrt2(*savedInvVar_);
 }
 
 void BatchNormalizationLayer::calMovingMeanAndVar() {
@@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->downClip(real(0.0));
 
   savedInvVar_->subScalar(-EPS);
-  savedInvVar_->sqrt(*savedInvVar_);
+  savedInvVar_->sqrt2(*savedInvVar_);
 }
 
 void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 36925a5ed2d56e4a5c58525cc238164f72bef40c..052c2077322be59f9d41966c1c8b6ab20c8f85bb 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp
index c30e26dc031378ce792534c5eec6c24fc0d20ef9..1976cb00175d467ca0d94660d754264582bfe832 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.cpp
+++ b/paddle/gserver/layers/BilinearInterpLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() {
 
   const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
   if (inImgH_ == 0) {
-    inImgH_ = conf.img_size_y();
+    inImgH_ = conf.image_conf().img_size_y();
   }
   if (inImgW_ == 0) {
-    inImgW_ = conf.img_size_x();
+    inImgW_ = conf.image_conf().img_size();
   }
 
   outImgH_ = conf.out_size_y();
   outImgW_ = conf.out_size_x();
-  numChannels_ = conf.num_channels();
+  numChannels_ = conf.image_conf().channels();
 
   CHECK(outImgH_ > 0 && outImgW_ > 0);
   CHECK(inImgH_ > 0 && inImgW_ > 0);
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index eba3c054fa8e7521e83d7c8dd1d87079a52b3967..4ff4b0ea793dc901d099bf73d55aa15463e62094 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 17d77879b27be332a49eae4e476b776ec2f5c8e2..2bafeb92158c56efe32f90742807f0af07bda5af 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 1496fb681acd7ca7190e43cce38c7eb347932d29..cc96fdd03fcac6925a16f0fb91045f065f74e803 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
index 8986741dc307ba765707d6e5817a2e376b27828e..fdb46aba68e924480a6595b02c04ff4d1edd914d 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.cpp
+++ b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index 1914062011d3bceba2f8765fb3cfd2d29ca6d6e9..1fd444ad10e71df2bb6d8bdb839e6f02b33d647f 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index ed4f864ba9167129db1a3f56403940d9d7807a15..02b7aaf17e89d889ca0030f9de2b5d7431a28fd3 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index 21c7fc61e168cea438339db4e7abce59082fc58d..d21b32b68c1a40c814af3aa2c285612a5f938d79 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp
index be5d2c8c75d6eb2381a2c1758088de0eff462200..14ec851551848b0e8182909537a26d37a1286ac3 100644
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index 18ba12583b5a22849f1ee849a3cce7249730fdaf..70d429bad656ade3c05256472d799ae72e128be5 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index 910eec8bbc10ef10f5dd4e4688eef5e87c21f506..d19adace7d58af16736fc2b6e536f5fd69a19863 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Layer.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 30dbf168fb6e439048e0168af572d1f20a303e79..7ac56e3a2ab2a2a7f2219b8bfd34c16a84c427c0 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "ContextProjection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index 188dec0fb31bf468c76b9b922e0972c86e819a2d..2df43bd04fec868924b5d45f9def231a48ee7f04 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index 7637e245a38959220f0d1d52e1f705d86a7c7303..7b234dc2a6663dc677affcae7dc6306c104c1250 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "ConvBaseLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 bool ConvBaseLayer::init(const LayerMap& layerMap,
@@ -38,11 +38,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     filterSizeY_.push_back(conf.filter_size_y());
     filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.img_size());
+    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
+                                              : conf.img_size());
     imgSizeW_.push_back(conf.img_size());
     groups_.push_back(conf.groups());
     filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.output_x());
+    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
     outputW_.push_back(conf.output_x());
   }
 
@@ -91,16 +92,19 @@ size_t ConvBaseLayer::calOutputSize() {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
       inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
+      const ConvConfig& conf = config_.inputs(i).conv_conf();
       if (isDeconv_) {
-        if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x();
-        if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x();
+        if (inH[i] == 0)
+          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
+        if (inW[i] == 0) inW[i] = conf.output_x();
         outH.push_back(imageSize(
             inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
         outW.push_back(imageSize(
             inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
       } else {
-        if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size();
-        if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size();
+        if (inH[i] == 0)
+          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+        if (inW[i] == 0) inW[i] = conf.img_size();
         outH.push_back(outputSize(
             inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
         outW.push_back(outputSize(
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index 85f57dbe0b7c9683ba0941ea0edc611f683cf1b4..aedf4100e32fa1294c361b6163c14eab7869b803 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
index 9b8e18b1ba2a4502bcdcecade94ec3e29730595c..f943410dee0dc2f3d356c9d7d8f61398fe2871c8 100644
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/math/Matrix.h"
-#include "paddle/math/MathUtils.h"
 #include "Operator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
@@ -93,9 +93,9 @@ private:
   bool caffeMode_;
   int inputOffset_, outputOffset_, weightOffset_;
   int numFilters_;
-  int padding_, stride_, filterSize_, channels_, imgSize_;
+  int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
   int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_;
+  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
 
   /// Following member variables are same with CudnnConvLayer.
   /// There is no explanation here.
@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
 void ConvOperator::reshape(int batchSize) {
   imageH_ = ins_[0]->getFrameHeight();
   imageW_ = ins_[0]->getFrameWidth();
-  if (imageH_ == 0) imageH_ = imgSize_;
+  if (imageH_ == 0) imageH_ = imgSizeY_;
   if (imageW_ == 0) imageW_ = imgSize_;
   outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
   outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
@@ -182,7 +182,10 @@ void ConvOperator::computeConvSizes() {
   hl_create_tensor_descriptor(&inputDesc_);
   int outputX =
       outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
+  int outputY =
+      outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
   CHECK_EQ(outputX, outputX_);
+  CHECK_EQ(outputY, outputY_);
   hl_create_tensor_descriptor(&outputDesc_);
   hl_create_convolution_descriptor(&convDesc_,
                                    inputDesc_,
@@ -236,10 +239,12 @@ void ConvOperator::getConvParams() {
   filterPixels_ = filterSize_ * filterSizeY_;
   channels_ = conf.channels();
   imgSize_ = conf.img_size();
-  imgPixels_ = imgSize_ * imgSize_;
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  imgPixels_ = imgSize_ * imgSizeY_;
   CHECK_EQ(conf.groups(), 1U);
   filterChannels_ = conf.filter_channels();
   outputX_ = conf.output_x();
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
   outputs_ = outputX_ * outputX_;
 }
 
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 4ab0a1dc84164114df080bc1ae06905b15a3ff86..e1c4b91ace21522a3bc640dfc4eaa1a42668ed02 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "ConvProjection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
@@ -46,7 +46,7 @@ void ConvProjection::getConvParams() {
   filterH_ = conf.filter_size_y();
   filterW_ = conf.filter_size();
 
-  configImgH_ = conf.img_size();
+  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
   configImgW_ = conf.img_size();
 
   channels_ = conf.channels();
@@ -58,8 +58,11 @@ void ConvProjection::getConvParams() {
 }
 
 void ConvProjection::initCudnn() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterH_, filterW_);
+  hl_create_filter_descriptor(&filterDesc_,
+                              channels_ / groups_,
+                              numFilters_ / groups_,
+                              filterH_,
+                              filterW_);
   hl_create_tensor_descriptor(&inputDesc_);
   hl_create_tensor_descriptor(&outputDesc_);
   hl_create_convolution_descriptor(&convDesc_,
@@ -86,7 +89,7 @@ void ConvProjection::initCudnn() {
 void ConvProjection::reshapeTensorDesc(int batchSize) {
   hl_tensor_reshape(inputDesc_,
                     batchSize,
-                    channels_,
+                    channels_ / groups_,
                     imageH_,
                     imageW_,
                     channels_ * imageH_ * imageW_,
@@ -115,7 +118,7 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 
   hl_tensor_reshape(outputDesc_,
                     batchSize,
-                    numFilters_,
+                    numFilters_ / groups_,
                     outputH_,
                     outputW_,
                     nStride,
@@ -127,6 +130,10 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+      << "Wrong input size for convolution"
+      << " channels=" << channels_ << " imageH=" << imageH_
+      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
 
   isSelectAlgo_ = (batchSize == batchNum_);
   batchNum_ = batchSize;
diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h
index 779fe1455ade10ba55e32f4d9478d446b01b8a19..c32e5e1d3ab2f85feb6dd2fb5fbddd7482598e58 100644
--- a/paddle/gserver/layers/ConvProjection.h
+++ b/paddle/gserver/layers/ConvProjection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index 6e77c1f14e6a6896f6ef7c4042954b25bd58266a..9bfb1ab7a47b11a6793159aefcb4f9fa12b81a6b 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index 7e1fef8bc600329ac62002dab7b91238b83b8023..3f4d77a2fe069f239db8cd099dd0d472d6ce3ccc 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
index 894cb5b0d8226cc3b4b60bac38801bf0a7ec6b6a..254120443dc3d41bf2422be2e88cb376d70c93d4 100644
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index bc47998c11f267a1737ff82e8aa2958f6859bf86..5dcc5d8a5b4dc76cb6cea023a874049731a26516 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index 56d177da6458a590299fee5b24b8a9c935510916..ad490b0b8c4656c1eabf519233f2386b4b6e9417 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 094c36ceb1f72ff9ee2cc9fa54de0b06312948fe..7e9519f6b3af50bf47b660b285c3593087f80271 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
+#include "CostLayer.h"
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include <cmath>
-#include "CostLayer.h"
+#include <memory>
+#include "paddle/utils/Logging.h"
 
 #include "paddle/math/SparseMatrix.h"
 
@@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
                                                     Matrix& target) {
   Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
   output.rowSum(*sftMaxSum_);
-  sftMaxSum_->log();
+  sftMaxSum_->log2();
 
   target.oneHotCrossEntropy(output, *label.ids);
   target.add(*sftMaxSum_);
 
-  sftMaxSum_->square();
+  sftMaxSum_->square2();
   target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
 }
 
@@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
   output.rowSum(*sftMaxSum_);
 
   Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
-  sftMaxSum_->reciprocal(*sumInv_);
+  sftMaxSum_->reciprocal2(*sumInv_);
 
   outputG.oneHotCrossEntropyBp(output, *label.ids);
   outputG.addColumnVector(*sumInv_);
 
-  sftMaxSum_->log();
+  sftMaxSum_->log2();
   sumInv_->dotMul(*sumInv_, *sftMaxSum_);
   sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
 
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 120ff9bd2d1b402e8ef2d074a84b76b0183dcab0..7f73bdb3f7d63ef1c8d76deb64f40d19d20f87c7 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index 6be62b1a25407a5340bb5cdd99745db5d33ec3da..09dac05a7ad7a80bd6b9e12e8f7f060310d516c8 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "CudnnBatchNormLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index 6220e77ceb5e248e5678c9170e85aff1cb40e1cd..b1e7d2082f1443313bfc858a17adfd737ecff98f 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp
index 93c5565d2f401549959d6b067b05289592433a3a..978c2c1479c64ab2cdebaaff7394059b3d033ab6 100644
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CudnnConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "CudnnConvLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h
index 6cfbadfb53839d847b8b2bcf768da0f473ac05e5..b869c695bd753076c6501a1253fcad22139ccadf 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "ConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
 #include "Projection.h"
-#include <vector>
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp
index 21d8e2579f77c98da1e30a205952fa53e02fb853..4adb2d4709e585a6fec052435c33714d6e3a3f0e 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CudnnPoolLayer.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/Matrix.h"
-#include "CudnnPoolLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 6a6b28db961553506bcf5db206a65e1e9d90fe94..072b2f9513f4ef8aed03ecfa7a9014667bb2ce9e 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp
index 9a4b2e9d3e256119f3ff24cfcb80d68c81f67c65..3551df4e172f0237685127b0b3869554d9c5f97d 100644
--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -49,8 +49,13 @@ void DataLayer::copyDataToOutput(Argument& output) {
       output.ids->copyFrom(*data_.ids);
     }
   }
-  output.setFrameHeight(data_.getFrameHeight());
-  output.setFrameWidth(data_.getFrameWidth());
+  if (config_.height() && config_.width()) {
+    output.setFrameHeight(config_.height());
+    output.setFrameWidth(config_.width());
+  } else {
+    output.setFrameHeight(data_.getFrameHeight());
+    output.setFrameWidth(data_.getFrameWidth());
+  }
   output.cpuSequenceDims = data_.cpuSequenceDims;
   output.sequenceStartPositions = data_.sequenceStartPositions;
   output.subSequenceStartPositions = data_.subSequenceStartPositions;
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index da74702201bd3af3cd73ad51ef2579da97674bc6..d3bc97bb6cd0b8faf8ae108a0147d77854596e25 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/gserver/layers/DataNormLayer.cpp
index b398f3dbedc44eb422124a725aa745f684e821e3..afd532c949fbc4ada53d65a92c858b7c6a11a382 100644
--- a/paddle/gserver/layers/DataNormLayer.cpp
+++ b/paddle/gserver/layers/DataNormLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index 1179d94fbbd4032c9275f0586de5b526eb21c095..b3043cffd210feaf9ddaed096de762aa7e2a6139 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp
index 9409493fdaaf0e84ab2e650e2c5e3db0c1fb1fbc..55dabd79d014a08f5a8bf3628483207984c917d0 100644
--- a/paddle/gserver/layers/DotMulOperator.cpp
+++ b/paddle/gserver/layers/DotMulOperator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp
index 862eeb6f01db04451afb8a91ecb2c04e0f796952..0a1ede3618cb7ae91d5f90fc6bac498c07acc38e 100644
--- a/paddle/gserver/layers/DotMulProjection.cpp
+++ b/paddle/gserver/layers/DotMulProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index 3a43705d263898bd407248b3d553185f7e40f798..fa53e2e4cfc8a220eeb2a637d7fe759f1744f9d5 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 71a69bd0d01f4f6fcd579a408008ad4e00b5fd4d..25948747fe93e65b77d8eef5ac4748c545f79e90 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,17 +29,19 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
    * meaning as in conv, we need to swap channels_ and numFilters here for
    * convTrans, and in other functions too.
    * */
-  int channel;
-  int numFilters;
+
   /* Initialize the projection */
   for (auto &inputConfig : config_.inputs()) {
     const ConvConfig &conf = inputConfig.conv_conf();
-    numFilters = isDeconv_ ? conf.channels() : numFilters_;
+    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
     subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() * conf.output_x());
-    channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(channel * conf.filter_size() * conf.filter_size() /
-                    conf.groups());
+    subN_.push_back(conf.output_x() *
+                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
+    int channel = isDeconv_ ? numFilters_ : conf.channels();
+    subK_.push_back(
+        channel * conf.filter_size() *
+        (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
+        conf.groups());
     /* Consistent caffe mode for multiple input */
     caffeMode_ = conf.caffe_mode();
   }
@@ -116,11 +118,11 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
                            imgSizeH_[inIdx],
                            imgSizeW_[inIdx],
                            channel,
+                           filterSizeY_[inIdx],
                            filterSize_[inIdx],
-                           filterSize_[inIdx],
+                           strideY_[inIdx],
                            stride_[inIdx],
-                           stride_[inIdx],
-                           padding_[inIdx],
+                           paddingY_[inIdx],
                            padding_[inIdx],
                            outputH_[inIdx],
                            outputW_[inIdx]);
@@ -145,7 +147,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
   real *expInData = expandInput_->getData();
   for (int g = 0; g < groups_[inIdx]; ++g) {
     MatrixPtr A =
-        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
+        Matrix::create(wgtData, subM, subK, false, useGpu_);  // mark transpose
     MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
     MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
     C->mul(A, B, 1, 1);
@@ -182,7 +184,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
       // create temporary matrix
       MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
       MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
+      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
       C->mul(A, B);  // mul
 
       // clear the temporary matrix
@@ -208,11 +210,11 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
                      imgSizeH_[inpIdx],
                      imgSizeW_[inpIdx],
                      channel,
+                     filterSizeY_[inpIdx],
                      filterSize_[inpIdx],
-                     filterSize_[inpIdx],
-                     stride_[inpIdx],
+                     strideY_[inpIdx],
                      stride_[inpIdx],
-                     padding_[inpIdx],
+                     paddingY_[inpIdx],
                      padding_[inpIdx],
                      outputH_[inpIdx],
                      outputW_[inpIdx],
@@ -247,10 +249,10 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
 
     // expand-mul one-group by one
     for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
-      C->mul(A, B, 1, 1);
+      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
+      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
+      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
+      C->mul(B, A, 1, 1);
 
       A->clear();
       B->clear();
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index 5939d27e2a873308d710c1670a3aec843c3573ad..8445642217cf3e83441ddd9beec80f99faf946bc 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "ConvBaseLayer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 0649289c1c671ae5952dd8db9d19f576da67409c..f9267b81a7d4264f5f43552e3d54a45e4b212e00 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "ExpandConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index 82a9e88a4208ea98a97bd56ef2f9f38de4f0031e..de81a017e1bac38a5717e8c83a028f5408c0e084 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp
index 1132ab4f92000c96b22a295b360143d2f356ec5a..520586b13889790c94a3e29902a4ea0ee55e8555 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "ExpandConvTransLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvTransLayer.h"
 
 /* The implementation of the convTransLayer is basically a swap of forward and
  * backward of the original convLayer.
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
index 47efe3f65643fd17b86832fc240cda2e30d3fcc4..4a527d67995e255c65fea1f310551f8de5630030 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp
index 9290ce4f6d46c1237322549924ce1eb7754d2309..de5acfde05add701a1a71815b90522a11350fc9b 100644
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index fbe0ced9b1754d72874071575b33f552ccf93cc6..5c636144235cdb3800aa921464985616f8ee9203 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index 97c8d143fe0d84c4e59e224962b53995ee50b844..d023074c52167554358d0d4df7ec40cfba9da2a6 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp
index 35a5cb5b7a450e7233b6dddbef58a2acccfb1608..9e72a33a3c6f443497192ff5d39b4d4ad4a02ec0 100644
--- a/paddle/gserver/layers/FullMatrixProjection.cpp
+++ b/paddle/gserver/layers/FullMatrixProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h
index ddb1e7b18c4f967383feb922ce89d13a452109b2..58499f2e1ee815b2e11b68fecdb06ed3abe24756 100644
--- a/paddle/gserver/layers/FullMatrixProjection.h
+++ b/paddle/gserver/layers/FullMatrixProjection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp
index 70c56499a7738c12db40bfd0ca5fec399d72f99b..89afe33c36697f8d57885043ed68cdf26576e358 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/FullyConnectedLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "FullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseMatrix.h"
-#include <vector>
-#include <algorithm>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index e15e1236cdb75d1c41bbb993f86545334785909a..ccd584585c97cb679332cbd10d6f3a1306ca5a54 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 495c2174f3e9afbee676622d53248c7f5aeea404..930d9a056164e7c677adb53b7b67901364da1309 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Layer.h"
 #include "GatedRecurrentLayer.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
@@ -386,8 +386,9 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
       {
         batchSize = outputGradTmp->getHeight();
         gruValue.prevOutValue =
-            (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize))
-                                    ->getData());
+            (n == 0
+                 ? nullptr
+                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
         gruGrad.prevOutGrad =
             (n == 0 ? nullptr
                     : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index 3b8706a44e21e5a780c6423b65369dc5b695b59b..25770ce57fbaa4d16c9454d824800f2f0c7f957d 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
-#include "SequenceToBatch.h"
 #include "GruCompute.h"
 #include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index 01579d55fd9d0918b62ae0ddd9a7e90b4a697a13..b77fdbb30e11b72b0c7de765df173204aa0b6851 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp
index d9d423af448fd267b777ef57964dced3b7a09f63..06907768e98f4bad952706cffbbd65d1f86cc6df 100644
--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu
index 4a3cf6b1ca73cc72ff79835ac341cedda4cc94e2..d5e547dce347c824f959425551afea66dfd94e5a 100644
--- a/paddle/gserver/layers/GruCompute.cu
+++ b/paddle/gserver/layers/GruCompute.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index 58b5aacba0403f8d10e34b055f5a69ad5ffa4837..42c0019319ac9f20f9c3349fb2429c30f03d682b 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 6c9b0c5771bec765d043cd654fbb30ba56f8c813..4a1006aa941f396c233a0cecfc38228f1f9fafe1 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Layer.h"
 #include "GruCompute.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index 61bc77778501fb9421cd2a72459d35ac9f47a5cb..d62a8d846e5b347aa44ce1951c043d5813a5b3ff 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 10762bc92687a3ea8debb7b9aa26a0cf0f94421c..70da3ac126e147387b20c5a97d0116a5a679e044 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp
index b38656c960f17b2c2c315eba70c61c328ed3e49a..f1d41a33d40f120d5de8b2bfe9cf3271eefa08be 100644
--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ b/paddle/gserver/layers/IdentityProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index b00bee235693d56aecfdc676647e102fe8d0ebfc..44fe1fb1fea4203a4a1cac67c581b13adda65966 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 0f9e7c0ff89531edeb5e7c5b2bc03f28b0a08b94..c47943f81c01589eada4b825d54be5c69314b6fa 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
 
 #include "AddtoLayer.h"
+#include "CRFLayer.h"
 #include "CosSimLayer.h"
 #include "CostLayer.h"
-#include "ExpandConvLayer.h"
-#include "CRFLayer.h"
 #include "DataLayer.h"
+#include "ExpandConvLayer.h"
 #include "FullyConnectedLayer.h"
 #include "HierarchicalSigmoidLayer.h"
 #include "MaxLayer.h"
@@ -33,7 +33,7 @@ limitations under the License. */
 #include "TransLayer.h"
 #include "ValidationLayer.h"
 
-P_DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
 
 namespace paddle {
 
@@ -316,12 +316,12 @@ void Layer::showOutputStats() {
     auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
     min = tmpMat->getMin();
     max = tmpMat->getMax();
-    tmpMat->square();
+    tmpMat->square2();
     LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
   } else {
     min = outSquare->getMin();
     max = outSquare->getMax();
-    outSquare->square();
+    outSquare->square2();
   }
   real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
   std = std > 0 ? std : 0;
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 3d427a1ac6e38f2bcd49195504d1086b83e3cdf3..172e558b82945296ef8a50d464c03efbfd597e0d 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
-#include <functional>
 #include <paddle/parameter/Argument.h>
-#include "paddle/utils/ClassRegistrar.h"
+#include <functional>
+#include <memory>
+#include "ModelConfig.pb.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
-#include "ModelConfig.pb.h"
 
-#include "paddle/gserver/activations/ActivationFunction.h"
 #include <paddle/parameter/ParallelParameter.h>
 #include <paddle/parameter/Weight.h>
+#include "paddle/gserver/activations/ActivationFunction.h"
 
 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index 2b3a50b2e29cd2291a9fc21980506baa6120563c..af550c7a0154802a93bacccab500695bdad36542 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
 #include "LinearChainCRF.h"
+#include <algorithm>
 
 namespace paddle {
 
@@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   expX_->assign(*matX);
   // subtract max to avoid overflow or underflow
   expX_->mul(maxX_, ones_, (real)-1, (real)1);
-  expX_->exp();
+  expX_->exp2();
 
   real* a = a_->getData();
   real* b = b_->getData();
@@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   real* expX = expX_->getData();
   real* maxX = maxX_->getData();
 
-  expW_->exp(*w_);
+  expW_->exp2(*w_);
   real* expW = expW_->getData();
 
   for (int i = 0; i < numClasses_; ++i) {
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index 6368f2b9de2f993c6a113315be8d642784b04726..a905bf803dd5443ef8d4ad7702720a50a5220a9a 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/gserver/layers/LinearChainCTC.cpp
index 3368eb4d8a796eef367042f78b8c18d47bc1330e..cb2b249110dbd736a46a713480eca12e59cb391b 100644
--- a/paddle/gserver/layers/LinearChainCTC.cpp
+++ b/paddle/gserver/layers/LinearChainCTC.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <math.h>
 #include "LinearChainCTC.h"
+#include <math.h>
 #include <limits>
 
 namespace paddle {
diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h
index 0a93d2e9a6d0d697f5f081abe9fad69faac9b04b..737c9d5c3188f4499267bb9536e1a75ef4871f23 100644
--- a/paddle/gserver/layers/LinearChainCTC.h
+++ b/paddle/gserver/layers/LinearChainCTC.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp
index 38057636edbea5d1d25d20740b16c319a653e42e..4c4297096423762355a5ee028cac252432cc1956 100644
--- a/paddle/gserver/layers/LstmCompute.cpp
+++ b/paddle/gserver/layers/LstmCompute.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "hl_recurrent_apply.cuh"
 #include "LstmCompute.h"
+#include "hl_recurrent_apply.cuh"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu
index af271d682f6f1077cd64e4ea77d1c3f207aff6f8..f75c0c40ccc833e35f8fe8f21c12b3d3f68d5eb6 100644
--- a/paddle/gserver/layers/LstmCompute.cu
+++ b/paddle/gserver/layers/LstmCompute.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 97be7218f251f21a9a50c7f8ec28e7c487420a2f..140a4c6ecf5cfaf1045cec3ca2db5d4f2e54aca4 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp
index e70a20e5c0217288b795f647f3918911e3713ceb..2543d1b49a801943819e05bc52e53eaeafae1edf 100644
--- a/paddle/gserver/layers/LstmLayer.cpp
+++ b/paddle/gserver/layers/LstmLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "LstmLayer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
 
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(prev_batch_state);
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 5b936ff44ef1bc26850c5051f4d5561529002cd4..f49df2c412f05f74da455d41cdf7c9bd4b9ec2e2 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/BaseMatrix.h"
-#include "SequenceToBatch.h"
 #include "LstmCompute.h"
+#include "SequenceToBatch.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 namespace paddle {
 
 /**
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index e7a8d519f2dc5eade613f3ad1981434ae8d59b7c..5fc6474b8653f4c7dac284e11d88f803405169a3 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index 93f52c1c314105f9d0b2530218d43045224df948..1243c12889542103f65b427da8f549e852773c5c 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "LstmLayer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
@@ -318,7 +318,7 @@ void MDLstmLayer::forward(PassType passType) {
   CHECK_EQ(starts[numSequences], batchSize);
 
   int* dimsData = input.cpuSequenceDims->getData();
-  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_ * numSequences);
+  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
 
   for (int i = 0; i < numSequences; i++) {
     std::vector<int> dims;
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 22670fa1210e1199266cb16a1f08826c3010a84e..80555f3f7b324100c059c3356a4a2e462bc6face 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp
index 42bc6bb815232ff8dfa6b49ebf47b10c252e28c5..23629e1986834f0195685cb1ec35358209de6a5b 100644
--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 74df0b8b576c8ea1eef56d465e8c4ceee5019fdb..472ee0ccca196250f4b81fc1e921aaee5f352b7e 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp
index a3de069bf7a6c9217e4adfeb2e65409955cc569c..3a86a95321d8843338267df374dae169271410f5 100644
--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MaxOutLayer.h"
-#include "hl_gpu.h"
 #include "hl_cnn.h"
+#include "hl_gpu.h"
 
 namespace paddle {
 
@@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() {
   imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
   imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
   if (imgSizeH_ == 0) {
-    imgSizeH_ = maxoutConf.img_size_y();
+    imgSizeH_ = maxoutConf.image_conf().img_size_y();
   }
   if (imgSizeW_ == 0) {
-    imgSizeW_ = maxoutConf.img_size_x();
+    imgSizeW_ = maxoutConf.image_conf().img_size();
   }
 
   featLen_ = imgSizeH_ * imgSizeW_;
@@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap,
 
   const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
   groups_ = conf.groups();
-  channels_ = conf.channels();
+  channels_ = conf.image_conf().channels();
   CHECK_EQ(channels_ % groups_, 0UL);
   outputChannels_ = channels_ / groups_;
 
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 9011a5c332b17a2f697380b1afb40ad9de504b91..59c2245e0d6490d4f8e1b77b1c88267747aaa63a 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp
index 1392188fcae715734d96b1402924515fa3618965..2525b1984b80a4200923c007d3021d468745133e 100644
--- a/paddle/gserver/layers/MixedLayer.cpp
+++ b/paddle/gserver/layers/MixedLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "MixedLayer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index 271e0c2538d3b7239a5d54ec43180dddff569b76..9655a152c7bc96fb3941fcbd9db4ff71a59e4ebe 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include "Layer.h"
-#include "Projection.h"
 #include "Operator.h"
+#include "Projection.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/gserver/layers/MultinomialSampler.cpp
index e85dca72d3162d857e768221e970fe8e3951ae9c..0b285ed20f7ad86575aad3f5cbe710108fd75733 100644
--- a/paddle/gserver/layers/MultinomialSampler.cpp
+++ b/paddle/gserver/layers/MultinomialSampler.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 59683d2ee29924e76ca11eb43fbd8cd175c3c357..677b047029305549084770bdb5eadfeaafbfac8a 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <random>
-
 #include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
@@ -32,6 +32,17 @@ class MultinomialSampler {
 public:
   MultinomialSampler(const real* prob, int size);
 
+  //! protobuf always using double.
+  static MultinomialSampler* create(const double* prob, int size) {
+#ifdef PADDLE_TYPE_DOUBLE
+    return new MultinomialSampler(prob, size);
+#else
+    std::unique_ptr<real[]> tmp(new real[size]);
+    std::copy(prob, prob + size, tmp.get());
+    return new MultinomialSampler(tmp.get(), size);
+#endif
+  }
+
   /**
    * @brief Generate a random sample.
    * @param g is a random number engine. See <random>.
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index c681eb0623ab7b8426fe34ce6817a3f5f4ad8246..d09720c5255747df11d4d7367f67a245e63e6846 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 50b29cdea5a352093c0508995da4cf3e2afcc995..5ab765247f63dfe6e6651ca4d27dc7183a9f33e1 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -99,8 +99,8 @@ public:
 
     if (config_.neg_sampling_dist_size()) {
       CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(new MultinomialSampler(config_.neg_sampling_dist().data(),
-                                            numClasses_));
+      sampler_.reset(MultinomialSampler::create(
+          config_.neg_sampling_dist().data(), numClasses_));
     }
 
     return true;
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index 7f6ffe229842113869b4f2d61d59cdc0f4e1ddf8..3db0af2515ee9f64aa6c0b0a441e88562d9e398e 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "NormLayer.h"
 #include "NormProjectionLayer.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
@@ -48,6 +48,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   outputX_ = conf.output_x();
   imgSize_ = conf.img_size();
   denoms_ = NULL;
+
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
   return true;
 }
 
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 9e848e5268d6b4b69f24802b66c5fed7cc1bf9e4..86255b231b1eee578e81f31d76fd66bb845b10b7 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <vector>
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "NormLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
@@ -49,7 +49,7 @@ public:
  */
 class ResponseNormLayer : public NormLayer {
 protected:
-  size_t channels_, size_, outputX_, imgSize_;
+  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
   float scale_, pow_;
   MatrixPtr denoms_;
 
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 6ac468e6fc7c2962beaf8c28192890634340b296..934fc31e0acf96263654f4d74a1a4394578986cc 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "NormProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "NormProjectionLayer.h"
 
 namespace paddle {
 size_t CMRProjectionNormLayer::getSize() {
@@ -23,7 +23,7 @@ size_t CMRProjectionNormLayer::getSize() {
   imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
   imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
   if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSize_;
+    imgSizeH_ = imgSizeY_;
   }
   if (imgSizeW_ == 0) {
     imgSizeW_ = imgSize_;
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index b42e98ab0941e59a38bb1cfa73f49682dbef942c..4f7b638334afe3832e03537486f3ffc4dbbdcd9d 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "NormLayer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/gserver/layers/Operator.cpp
index b89c4740142e377f0cbbe755377f37baac270552..a638933914fc489a1d2e4fea9d6144d76cc50cdf 100644
--- a/paddle/gserver/layers/Operator.cpp
+++ b/paddle/gserver/layers/Operator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
index ff6558dc73b8d60f3b4a3d87c9d28c650c8f2987..6fd331382f243039fa38b2762b2d5edede60d868 100644
--- a/paddle/gserver/layers/Operator.h
+++ b/paddle/gserver/layers/Operator.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/parameter/Parameter.h"
 #include "ModelConfig.pb.h"
+#include "paddle/parameter/Parameter.h"
 
-#include "paddle/parameter/Argument.h"
 #include "Layer.h"
+#include "paddle/parameter/Argument.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index 9b24a4f440c9e1fc3b4e73a7234c791fff045ea9..cf9a008318e9d8dd50d1f401576082c07680f6c4 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/gserver/layers/ParameterReluLayer.cpp
index cd3bffa2e1d01ef8367c39c20c8e6f366c583b68..836c1981ba1843ec280d64799cd3ac0d1d03f374 100644
--- a/paddle/gserver/layers/ParameterReluLayer.cpp
+++ b/paddle/gserver/layers/ParameterReluLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index 029c09381f0e13de111ef30c4574d2255abfd018..a82497fc01ca1f63719a905c7545911a7e05289b 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 511dfd87c12551c91e8864364dbf1a1085a989b6..96d5c54accc047b685502a178de2d290f3158731 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
 #ifndef PADDLE_ONLY_CPU
 #include "CudnnPoolLayer.h"
 #endif
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 59be295a538b007993e77f85f079f78a8b881eca..318b89d7c2bce896d183eba8c48c230d962918a5 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/MathUtils.h"
-#include <vector>
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index 1b227c8084991e4bbf1e380881a6018fe01e9180..d90b438448eb72e72e22e9a91a3cbcd84ac7e6cb 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9c3191bd80061c13b645c2a107eaa723e2495032..9a75f465f6fbb2f2a928b0e36fcfbe0e510d7b3a 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp
index aabc60af197af30a367c0f933276116ba316bd34..ed5011ab8990620acb12f3ca6c488ce403336d45 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.cpp
+++ b/paddle/gserver/layers/PoolProjectionLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "PoolProjectionLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 777b6f39e7cc4ebaa7078ce3378b2688363245e8..3dc6af2f0e9fb1a12eca7bc0c531a2e7b151fb8a 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 0b9672f220919c6ee1a792fc2d68e8ae540ea09a..64fecab5b08354ceea8b290b78eede72d24a98a2 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
@@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) {
     Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
 
     if (inG0) {
-      tmpMtx->log(*inV1);
+      tmpMtx->log2(*inV1);
       tmpMtx->dotMul(*tmpMtx, *outV);
 
       // inG0 += outG .* (log(inV1) * outV)
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index 95be7b34cb106665d2465630233fca6b34d71e79..ac7f658864fee6812ea89d1dbd84ad4db94e3035 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/gserver/layers/Projection.cpp
index c7eb4b644281ff6e7b58201c41888d3a8967f419..974b3cf059fff24de2b5f4cdcf6f9c05ceedee8d 100644
--- a/paddle/gserver/layers/Projection.cpp
+++ b/paddle/gserver/layers/Projection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 798503113d761091d1a1bdf9e4ec70e0c2c3b3a4..8cd8042479eafdbd6b8dac03b63b344fcf9526b1 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 08453e21b8ff27138f9fa44ac834b54eb94c0688..85812c9d660e07e915012a7337e621c10a6597ca 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
-#include "paddle/utils/Stat.h"
 #include "SequenceToBatch.h"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
+DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index a5443975da4ab6ecb302087fe71b018154d439b8..af8dd61d84e2e53ca26dc054d0516e62ab7aa216 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/gserver/layers/Layer.h"
 #include <functional>
+#include "paddle/gserver/layers/Layer.h"
 
 #include "paddle/gserver/gradientmachines/RecurrentGradientMachine.h"
 #include "paddle/utils/Stat.h"
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 3c478a33e350cf0e901381890e3df1496893f4db..7fcb3adea01b9d16394ee90b751b10902dc3a190 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index b39c9948b53118b51090059fc554e76f94316f81..59ff5d41b529099277375cd5e1b498f3331c3b0a 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 71570810f9576df74940968426c09ae421881ba6..7f0084be6b57f5ce8245609e64c744c1a049a925 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp
index 7999d02d384a06b900fbfa2c8bb271660b7fe008..ddb8c8711018ad9c92ef2f3bf26325aa5bffef89 100644
--- a/paddle/gserver/layers/ScalingProjection.cpp
+++ b/paddle/gserver/layers/ScalingProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 4dfa2c179dafe0d8dcc6766fbafeae129edcc49a..9200a01eee3be8ab61b6181ec337b2c3c70c5966 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "SelectiveFullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseMatrix.h"
-#include <vector>
-#include <algorithm>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index 9f92ae060521bd7852b67d45649d1cd0792961d4..bdf9a4652cc71710d1d33e8b085c5aec28f6f806 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index bd72ba3d167d99b5d3fdd047d6b1bfab611b3232..069bc26e602ff7d925b4115d12388b6716676b29 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 0e9531eabb4b389b762e235ec01d5f16c88cd4a1..4bfce766c769f4be2e5cc7bf691d539b1d307a47 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index c9f19b7d3b66b3ac031135c04a96ffe27245aa01..35260ca912d5d0e00213ffb7074bd8963da265da 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "SequencePoolLayer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 669af80e1d447a9150b450f9fca4456c89ed2c36..aa9c132586e55d0f6bccec1689db60145ca2d43f 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 5ca9b8b300161688817234909f2b875801d90995..23924b0490851ad3c3c74d77e7abd8b0af8fc234 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 04402db9c8af2f51f30a09cbf1e9c4023fe3e531..5fa7b6f4881b9582b540a5b1bfe849220cc2a4ea 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <vector>
-#include <algorithm>
 #include "SequenceToBatch.h"
-#include <iostream>
 #include <string.h>
+#include <algorithm>
+#include <iostream>
+#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h
index 6bc12f207ee3fadbd2a75ca5a5dbb7ce199cc99b..17e735a135cba8b43caf0ed9e06bb53903b5cd6a 100644
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ b/paddle/gserver/layers/SequenceToBatch.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/math/Vector.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index dd6ffcd50b01cfa56ee9fbc428ffc2cb9b73ce17..b678f414b6d76fa26818cb379fb0f0fb8fc7ec09 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
index 9609919695853552ed54d8d55e8a669002fa3147..14fe88ff8a5a1440ad1c98850e571f84813fec3e 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
 size_t SpatialPyramidPoolLayer::getSize() {
   CHECK_EQ(inputLayers_.size(), 1UL);
   size_t layerSize = 0;
-  const SppConfig& sppConf = config_.inputs(0).spp_conf();
+  const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
   imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
   imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
   if (imgSizeH_ == 0) {
-    imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
+    imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
   }
   if (imgSizeW_ == 0) {
-    imgSizeW_ = sppConf.img_size();
+    imgSizeW_ = conf.img_size();
   }
 
   size_t outputH = 1;
@@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
   pyramidHeight_ = sppConf.pyramid_height();
   poolType_ = sppConf.pool_type();
 
-  channels_ = sppConf.channels();
-  imgSizeW_ = sppConf.img_size();
-  imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
+  const ImageConfig& imageConf = sppConf.image_conf();
+  channels_ = imageConf.channels();
+  imgSizeW_ = imageConf.img_size();
+  imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
   poolProjections_.reserve(pyramidHeight_);
   projCol_.reserve(pyramidHeight_);
   projOutput_.resize(pyramidHeight_);
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 79db574d99bdb1137e6a55244c382f9c894239c8..32e88cf141a667d9dffbe7dcba46e9fde721f9e7 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 664f9e13c055df08552974048428326644b69a6e..c52fbee26232ad6eb09f84315a57c73e6aa02eb0 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index bcf39168408d2bac50c17d0e22ed747cf0b33d80..aa99b49380d3682ccf3d89220c0c68f22e458271 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/gserver/layers/TableProjection.cpp
index 2bc0d329d9605850ecdce6b4a87351579493d834..270acdd34baa2ba5d68718598cab5c3928cc1e9a 100644
--- a/paddle/gserver/layers/TableProjection.cpp
+++ b/paddle/gserver/layers/TableProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h
index 97c672508a009735a9a8f9980b715881c1f824a2..fb6c0e17c2da181aab23a1860fef993dd254955d 100644
--- a/paddle/gserver/layers/TableProjection.h
+++ b/paddle/gserver/layers/TableProjection.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp
index 03586cc6ff3d148a63af33d89b85d565e2198057..642eb1bdd31c0c16f251dd7afda1ae0a61c0872e 100644
--- a/paddle/gserver/layers/TensorLayer.cpp
+++ b/paddle/gserver/layers/TensorLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index 9ac651de4d99a23a12394c674bda827e935749b9..ac38ffb620570320497446a6825ca2273b73facc 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp
index 53a24d4cc4633898cff1b56f5a377959a38f6354..d1fa90f38415c53bd1c56df4a6c4be0508004bc6 100644
--- a/paddle/gserver/layers/TransLayer.cpp
+++ b/paddle/gserver/layers/TransLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "TransLayer.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 REGISTER_LAYER(trans, TransLayer);
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index 25b091f9f414ead5048cd65cfc16b67ae1387ad9..b43fa1ebfb003226daed724b4ede3006545e8b07 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
index c883283f782352e674d0fcf0369e8491e31d60ff..3f7ff0488207564e3ebbd5a467f42b46af3b31ff 100644
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp
index 0fee4bd2463ac86dfcb5ecc0b5e75564d86971d2..5127bcaba336b72dc76c832892e057724aeb3471 100644
--- a/paddle/gserver/layers/ValidationLayer.cpp
+++ b/paddle/gserver/layers/ValidationLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
 #include <algorithm>
 #include <fstream>
+#include <memory>
 
-#include "paddle/utils/Logging.h"
 #include "ValidationLayer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index eef9c80a7b11f5e2b6d1b028f4d2eff9102d6c28..4c1de7b3b7d6975c2693eb065f7d3e19cc51a95c 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 #include <memory>
 
-#include "paddle/gserver/evaluators/Evaluator.h"
 #include "Layer.h"
+#include "paddle/gserver/evaluators/Evaluator.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..94e926a8d8f678c91b5c0614a78ba829869ec150
--- /dev/null
+++ b/paddle/gserver/layers/WarpCTCLayer.cpp
@@ -0,0 +1,222 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "WarpCTCLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(warp_ctc, WarpCTCLayer);
+
+bool WarpCTCLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parament class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2UL);
+
+  /* The inputLayers_[0] must be sequence output without softmax */
+  numClasses_ = config_.size();
+  CHECK_GE(numClasses_, 2UL);
+  CHECK_EQ(numClasses_, inputLayers_[0]->getSize());
+
+  blank_ = config_.blank();
+  CHECK_LT(blank_, numClasses_);
+
+  normByTimes_ = config_.norm_by_times();
+
+  // We don't need sequenceStartPositions because each sample of output_ is
+  // for the cost of one sequence.
+  setNeedSequenceInfo(false);
+
+  return true;
+}
+
+void WarpCTCLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& output = getInput(0);
+  const Argument& labels = getInput(1);
+
+  CHECK(output.sequenceStartPositions);
+  CHECK(labels.sequenceStartPositions);
+  CHECK(labels.ids);
+
+  size_t numSequences = labels.sequenceStartPositions->getSize() - 1;
+  CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1);
+
+  resizeOutput(numSequences, 1);
+
+  const int* cpuLabelStartPositions =
+      labels.sequenceStartPositions->getData(false);
+  const int* cpuOutputStartPositions =
+      output.sequenceStartPositions->getData(false);
+
+  std::vector<int> cpuLabelLengths(numSequences);
+  std::vector<int> cpuOutputLengths(numSequences);
+  for (size_t i = 0; i < numSequences; i++) {
+    cpuLabelLengths[i] =
+        cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i];
+    cpuOutputLengths[i] =
+        cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i];
+  }
+
+  /* Get the maximum sequence length */
+  maxSequenceLength_ = 0;
+  maxSequenceLength_ = *std::max_element(
+      cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences);
+
+  Matrix::resizeOrCreate(batchValue_,
+                         /* height */ numSequences * maxSequenceLength_,
+                         /* width */ numClasses_,
+                         /* trans */ false,
+                         /* useGpu */ useGpu_);
+
+  Matrix::resizeOrCreate(batchGrad_,
+                         /* height */ numSequences * maxSequenceLength_,
+                         /* width */ numClasses_,
+                         /* trans */ false,
+                         /* useGpu */ useGpu_);
+  batchGrad_->zeroMem();
+
+  seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions);
+
+  /* labels always in CPU memory */
+  IVector::resizeOrCreate(cpuLabels_,
+                          /* size */ (labels.ids)->getSize(),
+                          /* useGpu */ false);
+  cpuLabels_->copyFrom(*(labels.ids));
+
+  /* labels always in CPU memory */
+  Matrix::resizeOrCreate(cpuCosts_,
+                         /* height */ numSequences,
+                         /* width */ 1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+
+  /* Init warp-ctc options */
+  hl_warpctc_options_t options;
+  hl_warpctc_init(blank_, useGpu_, &options);
+
+  /* Get the needed workspace size */
+  size_t workspaceBytes = 0;
+  hl_warpctc_get_workspace_size(cpuLabelLengths.data(),
+                                cpuOutputLengths.data(),
+                                numClasses_,
+                                numSequences,
+                                &options,
+                                &workspaceBytes);
+  CHECK_GT(workspaceBytes, 0UL);
+
+  size_t workspaceLength = workspaceBytes / sizeof(real) + 1;
+  Vector::resizeOrCreate(workspace_,
+                         /* size */ workspaceLength,
+                         /* useGpu */ useGpu_);
+
+  hl_warpctc_compute_loss(batchValue_->getData(),
+                          batchGrad_->getData(),
+                          cpuLabels_->getData(),
+                          cpuLabelLengths.data(),
+                          cpuOutputLengths.data(),
+                          numClasses_,
+                          numSequences,
+                          cpuCosts_->getData(),
+                          workspace_->getData(),
+                          &options);
+
+  /* Copy the costs */
+  output_.value->copyFrom(*cpuCosts_);
+}
+
+void WarpCTCLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  const Argument& output = getInput(0);
+  CHECK(batchGrad_);
+
+  batch2seqPadding(
+      output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_);
+}
+
+void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue,
+                                    MatrixPtr& batchValue,
+                                    const ICpuGpuVectorPtr& seqStartPositions) {
+  size_t numSequences = seqStartPositions->getSize() - 1;
+  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
+
+  real* seqData = seqValue->getData();
+  real* batchData = batchValue->getData();
+  if (useGpu_) {
+    hl_sequence2batch_copy_padding(batchData,
+                                   seqData,
+                                   seqStartPositionsData,
+                                   numClasses_,
+                                   maxSequenceLength_,
+                                   numSequences,
+                                   false,
+                                   true);
+  } else {
+    for (size_t i = 0; i < maxSequenceLength_; i++) {
+      for (size_t j = 0; j < numSequences; j++) {
+        size_t sequenceStart = seqStartPositionsData[j];
+        size_t sequenceLength =
+            seqStartPositionsData[j + 1] - seqStartPositionsData[j];
+        if (i < sequenceLength) {
+          memcpy(batchData + (i * numSequences + j) * numClasses_,
+                 seqData + (sequenceStart + i) * numClasses_,
+                 numClasses_ * sizeof(real));
+        } else {
+          memset(batchData + (i * numSequences + j) * numClasses_,
+                 0,
+                 numClasses_ * sizeof(real));
+        }
+      }
+    }
+  }
+}
+
+void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue,
+                                    MatrixPtr& batchValue,
+                                    const ICpuGpuVectorPtr& seqStartPositions,
+                                    bool normByTimes) {
+  size_t numSequences = seqStartPositions->getSize() - 1;
+  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
+
+  real* seqData = seqValue->getData();
+  real* batchData = batchValue->getData();
+  if (useGpu_) {
+    hl_sequence2batch_copy_padding(batchData,
+                                   seqData,
+                                   seqStartPositionsData,
+                                   numClasses_,
+                                   maxSequenceLength_,
+                                   numSequences,
+                                   normByTimes,
+                                   false);
+  } else {
+    for (size_t i = 0; i < numSequences; i++) {
+      int sequenceStart = seqStartPositionsData[i];
+      int sequenceLength =
+          seqStartPositionsData[i + 1] - seqStartPositionsData[i];
+      real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
+      for (int j = 0; j < sequenceLength; j++) {
+        for (size_t k = 0; k < numClasses_; k++) {
+          seqData[(sequenceStart + j) * numClasses_ + k] =
+              batchData[(j * numSequences + i) * numClasses_ + k] * scale;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d9ae9249af66dd085f5b6bb7a3c09d8b2276a24
--- /dev/null
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer integrating the open-source warp-ctc library
+ *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
+ *        temporal classification cost.
+ *
+ * The config file api is warp_ctc_layer.
+ */
+class WarpCTCLayer : public Layer {
+public:
+  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
+  ~WarpCTCLayer() {}
+
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
+
+protected:
+  /**
+   * sequence matrix and batch matrix copy:
+   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
+   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
+   */
+  void seq2batchPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions);
+  void batch2seqPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions,
+                        bool normByTimes);
+
+protected:
+  size_t numClasses_;
+  size_t blank_;
+  size_t maxSequenceLength_;
+  bool normByTimes_;
+
+  MatrixPtr batchValue_;
+  MatrixPtr batchGrad_;
+  VectorPtr workspace_;
+
+  IVectorPtr cpuLabels_;
+  MatrixPtr cpuCosts_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 0651d0b4733ea9c3f54a42169774217b65091aa6..34dc375f21a54688c459236551fb1bc4d41f2eb1 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -34,7 +34,22 @@ add_unittest_without_exec(test_ConvTrans
 
 add_test(NAME test_ConvTrans
     COMMAND test_ConvTrans)
+################# test_ConvUnify #######################
+add_unittest_without_exec(test_ConvUnify
+    test_ConvUnify.cpp
+    LayerGradUtil.cpp
+    TestUtil.cpp)
+    
+add_test(NAME test_ConvUnify
+    COMMAND test_ConvUnify)
+################# test_BatchNorm #######################
+add_unittest_without_exec(test_BatchNorm
+    test_BatchNorm.cpp
+    LayerGradUtil.cpp
+    TestUtil.cpp)
 
+add_test(NAME test_BatchNorm
+    COMMAND test_BatchNorm)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
     test_Evaluator.cpp
@@ -62,6 +77,17 @@ add_unittest(test_RecurrentLayer
     test_RecurrentLayer.cpp
     TestUtil.cpp)
 
+############### test_WarpCTCLayer #######################
+if(NOT WITH_DOUBLE)
+    add_unittest_without_exec(test_WarpCTCLayer
+        test_WarpCTCLayer.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_WarpCTCLayer
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+endif()
+
 ############### test_RecurrentGradientMachine ###############
 # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
 # I will fix it.
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 47575169172832cd3f95a53ed6e4dcb87a5b7a4b..1d5e7de1ba624d98c953efe1cdd2318548c4e914 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "LayerGradUtil.h"
 
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 namespace paddle {
 real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index a061c7fc533ff2c639ceda4db6d89a33fd3f0435..62ac2d160fd916c5bb114341a442eac7df114c99 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/layers/DataLayer.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
 
 #include "TestUtil.h"
 using namespace std;  // NOLINT
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/gserver/tests/TestUtil.cpp
index 84d516683c18551765d707f26cc7003ba3432c7f..e07c60861a4a6567fd1e28559b9806cb623a3bdf 100644
--- a/paddle/gserver/tests/TestUtil.cpp
+++ b/paddle/gserver/tests/TestUtil.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "TestUtil.h"
 
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/CommandLineParser.h"
 
-P_DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
+DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
 
 namespace paddle {
 
@@ -63,8 +63,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
       std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
           ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
     } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-          ->copyFrom(ids.data(), indices.data(), data.data());
+      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+          ids.data(), indices.data(), data.data());
     }
     return mat;
   } else {
@@ -80,8 +80,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
       std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
           ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
     } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-          ->copyFrom(ids.data(), indices.data(), data.data());
+      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+          ids.data(), indices.data(), data.data());
     }
     return mat;
   }
diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/gserver/tests/TestUtil.h
index 000f8884e8681db8f4d2a2d6454791958b964f92..ec86469aebbafbf5406a21e6825eda6c105a6b9d 100644
--- a/paddle/gserver/tests/TestUtil.h
+++ b/paddle/gserver/tests/TestUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/__init__.py b/paddle/gserver/tests/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/paddle/gserver/tests/__init__.py
+++ b/paddle/gserver/tests/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_dotmul_a.conf b/paddle/gserver/tests/concat_dotmul_a.conf
index 52340596b9b7d3a8a1872b1933f9913bf041ee78..db02ca7e80de63618a7abf7b3673840627cd8c93 100644
--- a/paddle/gserver/tests/concat_dotmul_a.conf
+++ b/paddle/gserver/tests/concat_dotmul_a.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_dotmul_b.conf b/paddle/gserver/tests/concat_dotmul_b.conf
index 68859867bf3b101a3682f2f841de53c0ade85792..5e64970e4440a4f1d8c9282faa486963b3515a9d 100644
--- a/paddle/gserver/tests/concat_dotmul_b.conf
+++ b/paddle/gserver/tests/concat_dotmul_b.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_fullmatrix_a.conf b/paddle/gserver/tests/concat_fullmatrix_a.conf
index 35bafc58ac3d7aef505ab1220ac2a2d840852f9d..940d1efc58fe9c21028c1b1e31c46648ab518cbe 100644
--- a/paddle/gserver/tests/concat_fullmatrix_a.conf
+++ b/paddle/gserver/tests/concat_fullmatrix_a.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_fullmatrix_b.conf b/paddle/gserver/tests/concat_fullmatrix_b.conf
index 00a957d97d591ffaa0e4ebdaaf7bc1c779f632fd..931e5b38efa019e1f0afbd59a00d4115a4aab67a 100644
--- a/paddle/gserver/tests/concat_fullmatrix_b.conf
+++ b/paddle/gserver/tests/concat_fullmatrix_b.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/gserver/tests/concat_table_a.conf
index a8ff70f883318676b5bd295c217105ca4b98edff..047cb44d156daa93ba50cc259144217990685055 100644
--- a/paddle/gserver/tests/concat_table_a.conf
+++ b/paddle/gserver/tests/concat_table_a.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/gserver/tests/concat_table_b.conf
index 95d7c10f7b0cd66e38f60c282e4f67ebf3b7cafb..c666ab994276721b66884e59fe89e816d086df8b 100644
--- a/paddle/gserver/tests/concat_table_b.conf
+++ b/paddle/gserver/tests/concat_table_b.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/gserver/tests/img_conv_a.conf
index 940589ed9ac242d6a73a74c9be39fcaafe66b7be..3ad15c64fe5b793768f5a108f4ce60d15fd5da4a 100644
--- a/paddle/gserver/tests/img_conv_a.conf
+++ b/paddle/gserver/tests/img_conv_a.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,6 +34,7 @@ conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
                       num_channels=8,
                       num_filters=16, stride=1,
                       bias_attr=True,
-                      act=LinearActivation())
+                      act=LinearActivation(),
+                      groups=2)
 
 outputs(concat, conv)
diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/gserver/tests/img_conv_b.conf
index 8ca9c94541504d208b94f45bf71c8da440d18411..e68008155e97256e4bc865016a507c96995bd2eb 100644
--- a/paddle/gserver/tests/img_conv_b.conf
+++ b/paddle/gserver/tests/img_conv_b.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
 concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
 
 proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8, num_filters=16, stride=1)
+                       num_channels=8, num_filters=16, stride=1, groups=2)
 
 with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
     conv += proj
diff --git a/paddle/gserver/tests/img_conv_c.conf b/paddle/gserver/tests/img_conv_c.conf
new file mode 100644
index 0000000000000000000000000000000000000000..4598ffbdb2f1452cacaf9715409263922828bcb0
--- /dev/null
+++ b/paddle/gserver/tests/img_conv_c.conf
@@ -0,0 +1,43 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+data = data_layer(name ="input", size=8*16*16)
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                        num_channels=8,
+                        num_filters=16, stride=1,
+                        bias_attr=False,
+                        act=ReluActivation(),
+                        layer_type="exconv")
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation(),
+                       layer_type="exconv")
+
+concat = concat_layer(input=[conv1, conv2])
+
+conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                      num_channels=8,
+                      num_filters=16, stride=1,
+                      bias_attr=True,
+                      act=LinearActivation(),
+                      groups=2,
+                      layer_type="exconv")
+
+outputs(concat, conv)
diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/gserver/tests/img_pool_a.conf
index 5938e7611201c9a4e3b44ca8aae2f39a80b1ff3b..afd271055d974734fc589a51401542b4bed99534 100644
--- a/paddle/gserver/tests/img_pool_a.conf
+++ b/paddle/gserver/tests/img_pool_a.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv,
                          stride_y=2,
                          padding=1,
                          padding_y=2,
-                         img_width=16,
                          pool_type=MaxPooling(),
 )
 avgpool = img_pool_layer(input=conv,
@@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv,
                          stride_y=2,
                          padding=1,
                          padding_y=2,
-                         img_width=16,
                          pool_type=AvgPooling(),
 )
 
diff --git a/paddle/gserver/tests/img_pool_b.conf b/paddle/gserver/tests/img_pool_b.conf
index 6ea9649b3f1eaf72686fcf8a157ef9d75c662e46..e8deb9edbe755c1bcf8ea0180125ff7c470b0e0a 100644
--- a/paddle/gserver/tests/img_pool_b.conf
+++ b/paddle/gserver/tests/img_pool_b.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/gserver/tests/pyDataProvider.py
index 91863b4175b1a58cb7d475732f293f32a3a6ed5a..7235a239439b7544805d1bd06dfb1a72c2e0e937 100644
--- a/paddle/gserver/tests/pyDataProvider.py
+++ b/paddle/gserver/tests/pyDataProvider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/pyDataProvider/trainer.conf b/paddle/gserver/tests/pyDataProvider/trainer.conf
index 7957814c010d259ac9d98766a273b38d591d1375..7d910df20d4077a6645c42e418816cfaeb28d7e5 100644
--- a/paddle/gserver/tests/pyDataProvider/trainer.conf
+++ b/paddle/gserver/tests/pyDataProvider/trainer.conf
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
index 715ac08a42d05cec9c7f4b09a0447d44835d417d..3afd45c72f4dd071ddca569caac8716fe102299b 100644
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index fab876fd30da0a80774d06028ae2321e12354d59..fd725727c04677b5ea8918f6721f0c007e80915d 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,10 +33,10 @@ def process(settings, file_name):
             label, comment = line.strip().split('\t')
             label = int(''.join(label.split()))
             words = comment.split()
-            word_slot = [
+            words = [
                 settings.word_dict[w] for w in words if w in settings.word_dict
             ]
-            yield word_slot, label
+            yield words, label
 
 
 ## for hierarchical sequence network
@@ -52,20 +52,20 @@ def hook2(settings, dict_file, **kwargs):
 @provider(init_hook=hook2, should_shuffle=False)
 def process2(settings, file_name):
     with open(file_name) as fdata:
-        label_list = []
-        word_slot_list = []
+        labels = []
+        sentences = []
         for line in fdata:
             if (len(line)) > 1:
                 label, comment = line.strip().split('\t')
                 label = int(''.join(label.split()))
                 words = comment.split()
-                word_slot = [
+                words = [
                     settings.word_dict[w] for w in words
                     if w in settings.word_dict
                 ]
-                label_list.append(label)
-                word_slot_list.append(word_slot)
+                labels.append(label)
+                sentences.append(words)
             else:
-                yield word_slot_list, label_list
-                label_list = []
-                word_slot_list = []
+                yield sentences, labels
+                labels = []
+                sentences = []
diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf
index 087aa96ccb5a7fc2b6d4f5ce81de4e820580570a..68d150d553588c864de56ce1e6f283cc42fbbf2f 100644
--- a/paddle/gserver/tests/sequence_layer_group.conf
+++ b/paddle/gserver/tests/sequence_layer_group.conf
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
index 93a0f6da7905c0b00cf70296143ded2d4431e430..88cb42798baff79fa6a86ef11dabf1781575c0b4 100644
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ b/paddle/gserver/tests/sequence_nest_layer_group.conf
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf
index 93b08eb2f8746d514e35b49e5261e4fa9fa681e6..2873a599669b4281a53cd71e8bb56f0d18c26b5a 100644
--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -55,9 +55,8 @@ def outer_step(x):
         input=x)
     last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
 
-    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out 
-    # links should be from sequences now.
+    # "return last" won't work, because recurrent_group only support the input 
+    # sequence type is same as return sequence type.
     return inner_rnn_output
 
 out = recurrent_group(
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
index 0614958b4719ddb2098dc495c4a6c615f2628457..ad14a2c927c89c9b480af5ad565c37e8b2e54469 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf
deleted file mode 100644
index d0b9450f4b9f9659fdd606503b6bc1ea45338c76..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf
+++ /dev/null
@@ -1,106 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_unequalength_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
-
-def outer_step(x1, x2):
-    outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim)
-    outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim)
-    def inner_step1(y):
-        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
-                           size = hidden_dim,
-                           boot_layer = outer_mem1)
-        out = fc_layer(input = [y, inner_mem],
-                       size = hidden_dim,
-                       act = TanhActivation(),
-                       bias_attr = True,
-                       name = 'inner_rnn_state_' + y.name)
-        return out
-
-    def inner_step2(y):
-        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
-                           size = hidden_dim,
-                           boot_layer = outer_mem2)
-        out = fc_layer(input = [y, inner_mem],
-                       size = hidden_dim,
-                       act = TanhActivation(),
-                       bias_attr = True,
-                       name = 'inner_rnn_state_' + y.name)
-        return out
-
-    encoder1 = recurrent_group(
-        step = inner_step1,
-        name = 'inner1',
-        input = x1)
-
-    encoder2 = recurrent_group(
-        step = inner_step2,
-        name = 'inner2',
-        input = x2)
-
-    sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1')
-    sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2')
-
-    encoder1_expand = expand_layer(input = sentence_last_state1,
-                                   expand_as = encoder2)
-
-    return [encoder1_expand, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
-    targetInlink=emb2)
-
-encoder1_last = last_seq(input = encoder1_rep)
-encoder1_expandlast = expand_layer(input = encoder1_last,
-                                   expand_as = encoder2_rep)
-context = mixed_layer(input = [identity_projection(encoder1_expandlast),
-                               identity_projection(encoder2_rep)],
-                      size = hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
-
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..7303d088043d5096a3491d3b3b32b231bde09a0a
--- /dev/null
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
@@ -0,0 +1,98 @@
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_unequalength_subseq')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 2
+
+speaker1 = data_layer(name="word1", size=dict_dim)
+speaker2 = data_layer(name="word2", size=dict_dim)
+
+emb1 = embedding_layer(input=speaker1, size=word_dim)
+emb2 = embedding_layer(input=speaker2, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_multi_unequalength_inputs.conf
+def outer_step(x1, x2):
+    index = [0]
+
+    def inner_step(ipt):
+        index[0] += 1
+        i = index[0]
+        outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim)
+
+        def inner_step_impl(y):
+            inner_mem = memory(
+                name="inner_rnn_state_" + y.name,
+                size=hidden_dim,
+                boot_layer=outer_mem)
+            out = fc_layer(
+                input=[y, inner_mem],
+                size=hidden_dim,
+                act=TanhActivation(),
+                bias_attr=True,
+                name='inner_rnn_state_' + y.name)
+            return out
+
+        encoder = recurrent_group(
+            step=inner_step_impl, name='inner_%d' % i, input=ipt)
+        last = last_seq(name="outer_rnn_state_%d" % i, input=encoder)
+        return encoder, last
+
+    encoder1, sentence_last_state1 = inner_step(ipt=x1)
+    encoder2, sentence_last_state2 = inner_step(ipt=x2)
+
+    encoder1_expand = expand_layer(
+        input=sentence_last_state1, expand_as=encoder2)
+
+    return [encoder1_expand, encoder2]
+
+
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
+    targetInlink=emb2)
+
+encoder1_last = last_seq(input=encoder1_rep)
+encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
+context = mixed_layer(
+    input=[
+        identity_projection(encoder1_expandlast),
+        identity_projection(encoder2_rep)
+    ],
+    size=hidden_dim)
+
+rep = last_seq(input=context)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(
+    classification_cost(
+        input=prob, label=data_layer(
+            name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf
index 3294c2c3fc431c9d07aad0ba4620ec97a435fd91..1084edfe708c3348d40b67e270f64d8cda3cee0f 100644
--- a/paddle/gserver/tests/sequence_rnn.conf
+++ b/paddle/gserver/tests/sequence_rnn.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf
index 51881e21d971bbebeceeab1a7c4954e50e3a5e60..40d031741573251aa94d2a0f355470c53c51de7e 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
similarity index 50%
rename from paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf
rename to paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
index 28b1cb98cf132fb74b08d440be44d68c6cf3ffae..786a0c6d780e4e8deadb35e52901e42dae67a281 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,11 +16,11 @@
 from paddle.trainer_config_helpers import *
 
 ######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_unequalength_seq')
-
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_unequalength_seq')
 
 settings(batch_size=2, learning_rate=0.01)
 ######################## network configure ################################
@@ -38,38 +38,40 @@ emb2 = embedding_layer(input=speaker2, size=word_dim)
 # This hierachical RNN is designed to be equivalent to the RNN in
 # sequence_nest_rnn_multi_unequalength_inputs.conf
 
+
 def step(x1, x2):
-	def calrnn(y):
-		mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim)
-		out = fc_layer(input = [y, mem],
-					   size = hidden_dim,
-					   act = TanhActivation(),
-					   bias_attr = True,
-					   name = 'rnn_state_' + y.name)
-		return out
-	
-	encoder1 = calrnn(x1)
-	encoder2 = calrnn(x2)
-	return [encoder1, encoder2]
+    def calrnn(y):
+        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
+        out = fc_layer(
+            input=[y, mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='rnn_state_' + y.name)
+        return out
+
+    encoder1 = calrnn(x1)
+    encoder2 = calrnn(x2)
+    return [encoder1, encoder2]
+
 
 encoder1_rep, encoder2_rep = recurrent_group(
-    name="stepout",
-    step=step,
-    input=[emb1, emb2])
+    name="stepout", step=step, input=[emb1, emb2])
 
-encoder1_last = last_seq(input = encoder1_rep)
-encoder1_expandlast = expand_layer(input = encoder1_last,
-                                   expand_as = encoder2_rep)
-context = mixed_layer(input = [identity_projection(encoder1_expandlast),
-                               identity_projection(encoder2_rep)],
-                      size = hidden_dim)
+encoder1_last = last_seq(input=encoder1_rep)
+encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
+context = mixed_layer(
+    input=[
+        identity_projection(encoder1_expandlast),
+        identity_projection(encoder2_rep)
+    ],
+    size=hidden_dim)
 
 rep = last_seq(input=context)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
 
+outputs(
+    classification_cost(
+        input=prob, label=data_layer(
+            name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index e54c5109e71de1a41ec2bda2af4a19745acbbc83..7d7e68da5c5a9dbcba024002a988f26f7613b724 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,20 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 void testActivation(const string& act) {
   LOG(INFO) << "test activation: " << act;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f5fcb670b70aed9f0a04180d344556a0390122f
--- /dev/null
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/ExpandConvTransLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Test that the batchNormLayer can be followed by a ConvLayer
+TEST(Layer, batchNorm) {
+  FLAGS_use_gpu = false;
+  TestConfig configBN;
+  const int CHANNELS = 6272;
+  const int IMG_SIZE = 1;
+  configBN.layerConfig.set_type("batch_norm");
+  configBN.layerConfig.set_name("bn");
+  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  configBN.layerConfig.set_active_type("relu");
+  configBN.biasSize = CHANNELS;
+  configBN.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
+                                /* paraSize= */ CHANNELS});
+
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+
+  // Setting up conv-layer config
+  TestConfig config;
+  config.biasSize = 64;
+  config.layerConfig.set_type("exconv");
+  config.layerConfig.set_num_filters(64);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
+  input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(5);
+  conv->set_filter_size_y(5);
+  conv->set_channels(128);
+  conv->set_padding(1);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(7);
+  conv->set_output_x(3);
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(configBN,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "batch_norm",
+                100,
+                false,
+                false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr bnLayer;
+  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
+
+  std::vector<ParameterPtr> parameters2;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters2, &convLayer);
+
+  bnLayer->forward(PASS_GC);
+  convLayer->forward(PASS_GC);
+
+  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
+  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index f3efdfb428d14435fbfced6cfef3b7dadd8ff5a9..99202c2d5702a9569c3a9a92897a8a0e38b8e2a6 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,26 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
-#include "paddle/trainer/Trainer.h"
-#include "paddle/utils/GlobalConstants.h"
+#include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Test that the convTrans forward is the same as conv backward
 TEST(Layer, convTransLayerFwd) {
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ab18f886848d198b9063c7559790497ce131efe
--- /dev/null
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/ExpandConvTransLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Do one forward pass of convTrans layer and check to see if its output
+// matches the given result
+MatrixPtr doOneConvTest(size_t imgSize,
+                        size_t output_x,
+                        size_t stride,
+                        size_t padding,
+                        size_t filter_size,
+                        size_t channel,
+                        size_t numfilters,
+                        size_t groups,
+                        MatrixPtr& inputData,
+                        real* param,
+                        bool useGpu) {
+  TestConfig config;
+  config.biasSize = numfilters;
+  if (useGpu) {
+    config.layerConfig.set_type("cudnn_conv");
+  } else {
+    config.layerConfig.set_type("exconv");
+  }
+  config.layerConfig.set_num_filters(numfilters);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  size_t weightSize = channel * filter_size * filter_size *
+                      config.layerConfig.num_filters() / groups;
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(filter_size);
+  conv->set_filter_size_y(filter_size);
+  conv->set_channels(channel);
+  conv->set_padding(padding);
+  conv->set_padding_y(padding);
+  conv->set_stride(stride);
+  conv->set_stride_y(stride);
+  conv->set_groups(groups);
+  conv->set_filter_channels(channel / groups);
+  conv->set_img_size(imgSize);
+  conv->set_output_x(output_x);
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
+  dataLayers[0]->getOutputValue()->zeroMem();
+  dataLayers[0]->getOutputValue()->copyFrom(*inputData);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters, &convLayer);
+  convLayer->getBiasParameter()->zeroMem();
+  convLayer->getParameters()[0]->zeroMem();
+  convLayer->getParameters()[0]
+      ->getBuf(PARAMETER_VALUE)
+      ->copyFrom(param, weightSize);
+  convLayer->forward(PASS_GC);
+
+  return convLayer->getOutputValue();
+}
+
+TEST(Layer, convParaUnified) {
+#ifndef PADDLE_ONLY_CPU
+  MatrixPtr input, resultCpu, resultGpu;
+  input = Matrix::create(1, 4 * 4, false, false);
+  float inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData);
+
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  input = Matrix::create(1, 3 * 3 * 2, false, false);
+  float inputData2[] = {1,
+                        2,
+                        3,
+                        4,
+                        5,
+                        6,
+                        7,
+                        8,
+                        9,
+
+                        10,
+                        11,
+                        12,
+                        13,
+                        14,
+                        15,
+                        16,
+                        17,
+                        18};
+  float param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData2);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  float param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index be639ea09380d02ed8251874bf690fc3596bddf2..e07066dad84aa6326c2447fc5ee80fa496735fbf 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,15 +15,15 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "paddle/trainer/Trainer.h"
 #include "TestUtil.h"
+#include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 enum InputType {
   INPUT_DATA,         // dense vector
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 374ae57dd3681f891cf3f5b698085f0b8fbc6cd7..8a8d094ed357a6565dd9827c4bb10b76db6a146a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "TestUtil.h"
@@ -26,11 +26,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 TEST(Operator, dot_mul) {
   TestConfig config;
@@ -166,9 +166,8 @@ TEST(Projection, scaling) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
-TEST(Projection, conv) {
-  const int NUM_FILTERS = 16;
+void testProjectionConv(size_t groups) {
+  const int NUM_FILTERS = 18;
   const int FILTER_SIZE = 2;
   const int FILTER_SIZE_Y = 3;
   const int CHANNELS = 3;
@@ -186,7 +185,7 @@ TEST(Projection, conv) {
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
-  conv->set_groups(1);
+  conv->set_groups(groups);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(IMAGE_SIZE);
   int output_x = outputSize(conv->img_size(),
@@ -203,15 +202,21 @@ TEST(Projection, conv) {
   conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
   conf.set_output_size(output_x * output_y * NUM_FILTERS);
 
-  testProjectionGrad(
-      conf,
-      INPUT_DATA,
-      /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
-      /* batchSize */ 100,
-      true,
-      false,
-      NUM_FILTERS,
-      true);
+  testProjectionGrad(conf,
+                     INPUT_DATA,
+                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
+                         FILTER_SIZE_Y / groups,
+                     /* batchSize */ 100,
+                     true,
+                     false,
+                     NUM_FILTERS,
+                     true);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(Projection, conv) {
+  testProjectionConv(1);
+  testProjectionConv(3);
 }
 #endif
 
@@ -223,9 +228,10 @@ TEST(Layer, BilinearInterpLayer) {
 
   LayerInputConfig* input = config.layerConfig.add_inputs();
   BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  bilinear->set_img_size_x(32);
-  bilinear->set_img_size_y(32);
-  bilinear->set_num_channels(4);
+  ImageConfig* image = bilinear->mutable_image_conf();
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);
 
   for (auto useGpu : {false, true}) {
     for (auto outSize : {32, 64}) {
@@ -348,7 +354,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(2);
@@ -361,12 +367,18 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   conv->set_groups(1);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(16);
+  conv->set_img_size_y(8);
   conv->set_output_x(outputSize(conv->img_size(),
                                 conv->filter_size(),
                                 conv->padding(),
                                 conv->stride(),
                                 /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
                               config.layerConfig.num_filters());
 
   testLayerGrad(config, "conv", 100, trans, useGpu);
@@ -466,10 +478,11 @@ TEST(Layer, maxoutLayer) {
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   MaxOutConfig* maxout = input->mutable_maxout_conf();
+  ImageConfig* image = maxout->mutable_image_conf();
 
-  maxout->set_img_size_x(32);
-  maxout->set_img_size_y(32);
-  maxout->set_channels(4);
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);
   maxout->set_groups(2);
 
   for (auto useGpu : {false, true}) {
@@ -981,7 +994,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
   config.layerConfig.set_type("norm");
   config.layerConfig.set_active_type("relu");
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   NormConfig* norm = input->mutable_norm_conf();
   norm->set_norm_type(normType);
@@ -991,7 +1004,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
   norm->set_pow(0.75);
   norm->set_blocked(0);
   norm->set_img_size(14);
+  norm->set_img_size_y(7);
   norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
   if (norm->norm_type() == "cmrnorm" ||
       norm->norm_type() == "cmrnorm-projection") {
     norm->set_scale(norm->scale() / norm->size());
@@ -999,7 +1014,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
     norm->set_scale(norm->scale() / (norm->size() * norm->size()));
   }
 
-  config.layerConfig.set_size(norm->output_x() * norm->output_x() *
+  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
                               norm->channels());
   config.biasSize = 0;
 
@@ -1100,11 +1115,12 @@ void testSppLayer(const string& poolType,
   SppConfig* sppConfig = input->mutable_spp_conf();
   sppConfig->set_pool_type(poolType);
   sppConfig->set_pyramid_height(pyramidHeight);
-  sppConfig->set_channels(16);
-  sppConfig->set_img_size(10);
-  sppConfig->set_img_size_y(20);
+  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
+  imageConfig->set_channels(16);
+  imageConfig->set_img_size(10);
+  imageConfig->set_img_size_y(20);
   int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * sppConfig->channels());
+  config.layerConfig.set_size(outputSize * imageConfig->channels());
   testLayerGrad(config, "spp", 100, trans, useGpu);
 }
 
@@ -1414,13 +1430,15 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
   TestConfig config;
   const int CHANNELS = 10;
   const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
   config.layerConfig.set_type(type);
-  config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  config.layerConfig.set_size(size);
   config.layerConfig.set_active_type("sigmoid");
   config.biasSize = CHANNELS;
   config.inputDefs.push_back({INPUT_DATA,
                               "layer_0",
-                              /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
+                              /* dim= */ size,
                               /* paraSize= */ CHANNELS});
 
   config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
@@ -1435,6 +1453,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
   ImageConfig* img_conf = input->mutable_image_conf();
   img_conf->set_channels(CHANNELS);
   img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);
 
   testLayerGrad(config,
                 "batch_norm",
@@ -1461,6 +1480,7 @@ TEST(Operator, conv) {
   const int FILTER_SIZE_Y = 3;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
+  const int IMAGE_SIZE_Y = 8;
   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
   operatorConf.set_type("conv");
   ConvConfig* conv = operatorConf.mutable_conv_conf();
@@ -1475,19 +1495,22 @@ TEST(Operator, conv) {
   conv->set_groups(1);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(IMAGE_SIZE);
-  int output_x = outputSize(conv->img_size(),
-                            conv->filter_size(),
-                            conv->padding(),
-                            conv->stride(),
-                            /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  config.layerConfig.set_size(output_x * output_x *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
                               NUM_FILTERS);
 
   config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0});
+      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
   config.inputDefs.push_back(
       {INPUT_DATA,
        "layer_1",
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 913d6ed7511a0c3c7c0b40e1fbdb48a17b51b1b2..330adee8f77f495dab6a13190aaca6a3a5f86b2c 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
index 3fc099adbdb6cb562c4bfc419b777ef534bdfed7..eadf40ade091ae8b3e19d7dc6c999288e8e88c1b 100644
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -20,8 +20,8 @@ limitations under the License. */
 #undef PADDLE_DISABLE_TIMER
 #include "paddle/utils/Stat.h"
 
-#include "paddle/utils/Util.h"
 #include "paddle/gserver/layers/MultinomialSampler.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 1810bc31fc2ce00ed6d8fd588c0dfa9ce398cb45..fc60228f816e0cea30ef764c59a8c7875ed4a0e8 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,22 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #undef PADDLE_DISABLE_TIMER
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
+#include "TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
-#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DEFINE_bool(use_label, true, "input label or sequence label");
-P_DEFINE_bool(static_para, false, "static parameter");
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");
 
 struct DataIn {
   std::vector<Argument> inArgs;
@@ -255,10 +255,20 @@ TEST(Compare, img_conv) {
   compareNetwork(config_file_a, config_file_b);
   FLAGS_use_gpu = useGpu;
 }
+
+// Test cudnn_conv and exconv give the same result
+TEST(Compare, img_conv2) {
+  std::string config_file_a = "./gserver/tests/img_conv_a.conf";
+  std::string config_file_b = "./gserver/tests/img_conv_c.conf";
+  bool useGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = true;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+}
 #endif
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
 TEST(Compare, network) {
   if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
     compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index 01070bc1cb3023bc0321f0a8e867b8abd7030e08..d421b6e2f2536e266883508ff29cbec731c9d7e3 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include "paddle/utils/Util.h"
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
+#include "paddle/utils/Util.h"
 
 #include "TestUtil.h"
 
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 802f9aa4cb558f48fe55d7d7d5c882d25925bb32..0f264ecf91837f6681f0577b93be7e35be268c04 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 24aa73910f254e636dfb88182552fe47c12c8543..5f8bc5ecd0f77efc6dcda0330f124ca6cab7f277 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,16 +15,16 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
 #include <gtest/gtest.h>
 #include <fstream>
-#include "paddle/utils/Util.h"
-#include "paddle/utils/PythonUtil.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_string(train_list, "unittest.list", "file list for unittest");
+DEFINE_string(train_list, "unittest.list", "file list for unittest");
 
 namespace paddle {
 namespace unittest {
 namespace pydp2 {
-extern void setOnPoolFilledHook(const std::function<void(size_t)>& func);
+extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
 extern void clearOnPoolFilledHook();
 
 }  // namespace pydp2
@@ -33,8 +33,8 @@ extern void clearOnPoolFilledHook();
 
 const paddle::real epsilon = 1e-5;
 
-static inline int64_t readDataBatch(paddle::DataBatch* batch,
-                                    const std::string& funcName,
+static inline int64_t readDataBatch(paddle::DataBatch *batch,
+                                    const std::string &funcName,
                                     int64_t batchSize = 65535) {
   paddle::DataConfig config;
   config.set_type("py2");
@@ -143,7 +143,7 @@ TEST(PyDataProvider2, init_hook) {
   paddle::DataBatch batch;
   int64_t num = provider->getNextBatchInternal(100000, &batch);
   ASSERT_EQ(num, 200);
-  auto& mat = batch.getStreams()[0].value;
+  auto &mat = batch.getStreams()[0].value;
   ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
   for (size_t i = 0; i < 200; ++i) {
     for (size_t j = 0; j < 20; ++j) {
@@ -170,7 +170,7 @@ TEST(PyDataProvider2, sparse_no_value_no_seq) {
   CHECK(csm != nullptr);
   for (int i = 0; i < 200; ++i) {
     CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int* cols = csm->getRowCols(i);
+    int *cols = csm->getRowCols(i);
     for (int j = 0; j < 10; ++j) {
       CHECK_EQ(cols[j], (i + 1) * (j + 1));
     }
@@ -185,8 +185,8 @@ TEST(PyDataProvider2, sparse_value_no_seq) {
   CHECK(csm != nullptr);
   for (int i = 0; i < 200; ++i) {
     CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int* cols = csm->getRowCols(i);
-    real* dat = csm->getRowValues(i);
+    int *cols = csm->getRowCols(i);
+    real *dat = csm->getRowValues(i);
     for (int j = 0; j < 10; ++j) {
       EXPECT_EQ(cols[j], (i + 1) * (j + 1));
       EXPECT_EQ(dat[j], real(j) / real(i + 1));
@@ -197,7 +197,7 @@ TEST(PyDataProvider2, sparse_value_no_seq) {
 TEST(PyDataProvider2, index_seq) {
   paddle::DataBatch batch;
   CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
-  auto& arg = batch.getStreams()[0];
+  auto &arg = batch.getStreams()[0];
   CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
   size_t tmp = 0;
   for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
@@ -219,7 +219,7 @@ TEST(PyDataProvider2, index_seq) {
 TEST(PyDataProvider2, index_sub_seq) {
   paddle::DataBatch batch;
   ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
-  auto& arg = batch.getStreams()[0];
+  auto &arg = batch.getStreams()[0];
   size_t tmp = 0;
   for (size_t i = 0; i < 200; ++i) {
     for (size_t j = 0; j < i + 1; ++j) {
@@ -268,7 +268,7 @@ TEST(PyDataProvider2, min_pool_size) {
     }
   });
   while (true) {
-    size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
       totalData -= realBatchSize;
     } else {
@@ -291,7 +291,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   provider->reset();
   constexpr size_t batchSize = 100;
   while (true) {
-    size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
       CHECK_LE(realBatchSize, batchSize);
     } else {
@@ -317,12 +317,12 @@ TEST(PyDataProvider2, input_order) {
   provider->reset();
   constexpr size_t batchSize = 100;
   while (true) {
-    size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (!realBatchSize) {
       break;
     }
-    ASSERT_EQ(batch.getStreams().size(), (size_t)2);
-    for (size_t i = 0; i < realBatchSize; ++i) {
+    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
+    for (int64_t i = 0; i < realBatchSize; ++i) {
       ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
       ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
     }
@@ -341,11 +341,11 @@ TEST(PyDataProvider2, test_check) {
       paddle::DataProvider::create(config, false));
   provider->reset();
   while (true) {
-    size_t realBatchSize = provider->getNextBatchInternal(100, &batch);
+    int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
     if (!realBatchSize) {
       break;
     } else {
-      auto& ivec = batch.getStream(0).ids;
+      auto &ivec = batch.getStream(0).ids;
       for (size_t i = 0; i < ivec->getSize(); ++i) {
         CHECK_LT(ivec->getData()[i], 10);
       }
@@ -370,7 +370,30 @@ TEST(PyDataProvider2, multiThread) {
   provider.reset();
 }
 
-int main(int argc, char** argv) {
+TEST(PyDataProvider2, minPoolSizeWithCache) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_min_pool_size_with_cache");
+  config.set_async_load_data(true);
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  paddle::DataBatch batch;
+
+  for (int i = 0; i < 10; ++i) {
+    provider->reset();
+    int64_t sum = 0;
+    while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
+      sum += actualNum;
+    }
+    ASSERT_EQ(1 << 20, sum);
+  }
+}
+
+int main(int argc, char **argv) {
   testing::InitGoogleTest(&argc, argv);
   paddle::initMain(argc, argv);
   paddle::initPython(argc, argv);
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 7ca30198fb1d0e7384db2c28524c7898dcd27e50..f7b540013e76f03878a88ebd593a9af6f0ef16c8 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -111,3 +111,13 @@ def test_check(settings, filename):
             if i < 10:
                 yield_good_value = True
             yield i
+
+
+@provider(
+    input_types=[index_slot(10)],
+    min_pool_size=1000,
+    cache=CacheType.CACHE_PASS_IN_MEM, )
+def test_min_pool_size_with_cache(settings, filename):
+    import random
+    for _ in xrange(2**20):
+        yield random.randint(0, 9)
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 80d713dac03a42b370d50ebb17d089e9be2f17ff..b47279b77a6b3aea1313628d79cfd27efa35c361 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <paddle/utils/Util.h>
-#include <paddle/utils/Version.h>
-#include <paddle/utils/PythonUtil.h>
+#include <paddle/gserver/gradientmachines/GradientMachine.h>
 #include <paddle/trainer/Trainer.h>
 #include <paddle/trainer/TrainerInternal.h>
-#include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/utils/PythonUtil.h>
+#include <paddle/utils/Util.h>
+#include <paddle/utils/Version.h>
 
-P_DECLARE_int32(seed);
+DECLARE_int32(seed);
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -45,10 +45,9 @@ public:
     auto p = const_cast<TrainerForTest*>(this);
     auto& params = p->getGradientMachine()->getParameters();
     return std::accumulate(
-        params.begin(),
-        params.end(),
-        0UL,
-        [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
+        params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) {
+          return a + p->getSize();
+        });
   }
 };
 
@@ -128,7 +127,7 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn) {
+TEST(RecurrentGradientMachine, DISABLED_rnn) {
   for (bool useGpu : {false, true}) {
     test("gserver/tests/sequence_rnn.conf",
          "gserver/tests/sequence_nest_rnn.conf",
@@ -137,7 +136,7 @@ TEST(RecurrentGradientMachine, rnn) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn_multi_input) {
+TEST(RecurrentGradientMachine, DISABLED_rnn_multi_input) {
   for (bool useGpu : {false, true}) {
     test("gserver/tests/sequence_rnn_multi_input.conf",
          "gserver/tests/sequence_nest_rnn_multi_input.conf",
@@ -146,23 +145,24 @@ TEST(RecurrentGradientMachine, rnn_multi_input) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
+TEST(RecurrentGradientMachine, DISABLED_rnn_multi_unequalength_input) {
   for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_rnn_multi_unequalength_inputs.conf",
-         "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf",
+    test("gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
+         "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
          1e-6,
          useGpu);
   }
 }
 
 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
   if (paddle::version::isWithPyDataProvider()) {
     if (!paddle::version::isWithGpu()) {
       FLAGS_use_gpu = false;
     }
     initMain(argc, argv);
     initPython(argc, argv);
-    testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
   } else {
     return 0;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 0643cec38b3a5d96de64438c7342f827fde808a9..f91c788863b6963df92b735dbfef2bacee1fff45 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <paddle/utils/Version.h>
+#include <vector>
+#include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
-#include "ModelConfig.pb.h"
 
 #include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(rnn_use_batch);
-P_DECLARE_int32(fixed_seq_length);
+DECLARE_bool(use_gpu);
+DECLARE_bool(rnn_use_batch);
+DECLARE_int32(fixed_seq_length);
 
 void checkError(const Matrix& matrix1, const Matrix& matrix2) {
   CHECK(matrix1.getHeight() == matrix2.getHeight());
@@ -220,8 +220,8 @@ TEST(Layer, RecurrentLayer) {
 }
 
 #define protected public
-#include "paddle/gserver/layers/LstmLayer.h"
 #include "paddle/gserver/layers/GatedRecurrentLayer.h"
+#include "paddle/gserver/layers/LstmLayer.h"
 template <class T>
 class TestRecurrentLayer {
 public:
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index 204b03332ff5bba3b9f3e5d98050942d6f0f390f..ab23d00a2cb6077147f5b89664a8e2437b4cd63b 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,28 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
+#include <math.h>
 #include <paddle/utils/PythonUtil.h>
+#include <algorithm>
 #include <cstdlib>
 #include <ctime>
-#include <math.h>
-#include <gtest/gtest.h>
-#include <algorithm>
+#include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/FullyConnectedLayer.h"
+#include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h"
-#include "ModelConfig.pb.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(num_passes);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(config_args);
+DECLARE_bool(use_gpu);
+DECLARE_int32(num_passes);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(config_args);
 
 size_t fcLayerWidth = 1024;
 
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a4a814d5247410248f7418e1ef2c79a2da42507
--- /dev/null
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/utils/Version.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/CTCLayer.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/Layer.h"
+#include "paddle/gserver/layers/WarpCTCLayer.h"
+
+#include "TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+
+const real* getData(const Matrix& matrix) {
+  if (matrix.useGpu()) {
+    MatrixPtr cpuMatrix = Matrix::create(
+        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
+    cpuMatrix->copyFrom(matrix);
+    return cpuMatrix->getData();
+  } else {
+    return matrix.getData();
+  }
+}
+
+int checkError(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
+  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
+  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+
+  const real* data1 = getData(matrix1);
+  const real* data2 = getData(matrix2);
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  return count;
+}
+
+void initArgument(size_t batchSize,
+                  int layerSize,
+                  bool useGpu,
+                  Argument& data) {
+  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.value->randomizeUniform();
+  data.value->add(-0.5);
+  data.grad->zeroMem();
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+}
+
+LayerPtr createDataLayer(
+    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createLabelLayer(string name,
+                          size_t batchSize,
+                          size_t numClasses,
+                          bool useGpu) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(1);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  Argument data;
+  data.ids = IVector::create(batchSize, useGpu);
+  data.ids->rand(numClasses - 1);
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+
+  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  labelLayer->setData(data);
+  labelLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createCTCLayer(string name,
+                        size_t numClasses,
+                        bool useGpu,
+                        bool normByTimes,
+                        LayerPtr dataLayer,
+                        LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+
+  ParameterMap parameterMap;
+
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_norm_by_times(normByTimes);
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+
+  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+
+  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
+
+  softmaxActivation->forward(dataLayer->getOutput());
+  layer->forward(PASS_GC);
+
+  layer->backward();
+  softmaxActivation->backward(dataLayer->getOutput());
+
+  return layer;
+}
+
+LayerPtr createWarpCTCLayer(string name,
+                            size_t numClasses,
+                            bool useGpu,
+                            bool normByTimes,
+                            LayerPtr dataLayer,
+                            LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+
+  ParameterMap parameterMap;
+
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("warp_ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_blank(numClasses - 1);
+  layerConfig.set_norm_by_times(normByTimes);
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+
+  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+
+  layer->forward(PASS_GC);
+  layer->backward();
+
+  return layer;
+}
+
+TEST(Layer, WarpCTCLayer) {
+  for (auto layerSize : {10, 64}) {
+    for (auto batchSize : {1, 10, 32}) {
+      for (auto normByTimes : {false, true}) {
+        for (auto useGpu : {false, true}) {
+#ifdef PADDLE_ONLY_CPU
+          if (useGpu) continue;
+#endif
+          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
+
+          FLAGS_use_gpu = useGpu;
+
+          Argument data0;
+          initArgument(batchSize, layerSize, useGpu, data0);
+
+          Argument data1;
+          data1.resizeAndCopyFrom(data0);
+
+          LayerPtr dataLayer0 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data0);
+          LayerPtr dataLayer1 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data1);
+
+          LayerPtr labelLayer =
+              createLabelLayer("label", batchSize, layerSize, useGpu);
+
+          LayerPtr warpctcLayer = createWarpCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
+          LayerPtr ctcLayer = createCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
+
+          /// Check cost
+          LOG(INFO) << "Check cost: "
+                    << checkError(*(warpctcLayer->getOutput().value),
+                                  *(ctcLayer->getOutput().value))
+                    << " different elements.";
+
+          /// Check gradients
+          LOG(INFO) << "Check gradients: "
+                    << checkError(*(dataLayer0->getOutput().grad),
+                                  *(dataLayer1->getOutput().grad))
+                    << " different elements";
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index cba8b37289b53b7d75c64a6a95c9e3900b193902..666a8b8368e3e2ebc522902c176d7491d2920d2a 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <mutex>
 #include <stdlib.h>
+#include <mutex>
 #include "hl_gpu.h"
 #include "paddle/utils/Logging.h"
 
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 2f32b3fdd1a26c5b1bca43d0bd0ebb0896a012c4..0a0d92d1ae65f5b6020eb71fe2a6db5a3c625d9c 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
 
 DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
 template<>
-void BaseMatrixT<real>::exp() { applyUnary(unary::Exp<real>()); }
+void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
 
 DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
 template<>
-void BaseMatrixT<real>::log() {
+void BaseMatrixT<real>::log2() {
   if (useGpu_) {
     applyUnary(unary::Log<real>());
   } else {
@@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() {
 
 DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
 template<>
-void BaseMatrixT<real>::sqrt() { applyUnary(unary::Sqrt<real>()); }
+void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
 
 DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
 template<class T>
-void BaseMatrixT<T>::square() { applyUnary(unary::Square<T>()); }
+void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
 
 DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
 template<class T>
-void BaseMatrixT<T>::reciprocal() { applyUnary(unary::Reciprocal<T>()); }
+void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
 
 DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
 template<class T>
-void BaseMatrixT<T>::abs() { applyUnary(unary::Abs<T>()); }
+void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
 
 DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
 template<class T>
-void BaseMatrixT<T>::sign() { applyUnary(unary::Sign<T>()); }
+void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
 
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
 template<class T>
@@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
 template<>
-void BaseMatrixT<real>::pow(real p) {
+void BaseMatrixT<real>::pow2(real p) {
   if (useGpu_) {
     applyUnary(unary::Pow<real>(p));
   } else {
@@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
 template<>
-void BaseMatrixT<real>::pow(BaseMatrixT& b, real p) {
+void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
   if (useGpu_) {
     applyBinary(binary::Pow<real>(p), b);
   } else {
@@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
 
 DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
 template<class T>
-void BaseMatrixT<T>::square(BaseMatrixT& b) {
+void BaseMatrixT<T>::square2(BaseMatrixT& b) {
   applyBinary(binary::Square<T>(), b);
 }
 
@@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
 
 DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
 template<class T>
-void BaseMatrixT<T>::reciprocal(BaseMatrixT& b) {
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
   applyBinary(binary::Reciprocal<T>(), b);
 }
 
@@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
 
 DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
 template<class T>
-void BaseMatrixT<T>::abs(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
+void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
 
 DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
 template<class T>
@@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
 
 DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
 template<class T>
-void BaseMatrixT<T>::sign(BaseMatrixT& b) { applyBinary(binary::Sign<T>(), b); }
+void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
+  applyBinary(binary::Sign<T>(), b);
+}
 
 DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
 template<>
-void BaseMatrixT<real>::exp(BaseMatrixT& b) {
+void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
   applyBinary(binary::Exp<real>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
 template<>
-void BaseMatrixT<real>::log(BaseMatrixT& b) {
+void BaseMatrixT<real>::log2(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::Log<real>(), b);
   } else {
@@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) {
 
 DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
 template<>
-void BaseMatrixT<real>::sqrt(BaseMatrixT& b) {
+void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
   applyBinary(binary::Sqrt<real>(), b);
 }
 
@@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
 
 DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
 template<class T>
-void BaseMatrixT<T>::max(BaseMatrixT& b, BaseMatrixT& c) {  // NOLINT
+void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::Max<T>(), b, c);
 }
 
@@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
                                   a = 1 / (p1 * b + p2));
 template<class T>
-void BaseMatrixT<T>::reciprocal(BaseMatrixT& b, T p1, T p2) {
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
   applyBinary(binary::Reciprocal2<T>(p1, p2), b);
 }
 
@@ -1240,6 +1242,12 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   }
 }
 
+DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
+template<class T>
+void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
+    applyBinary(binary::DeepSwap<T>(), b);
+}
+
 template<>
 void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                   BaseMatrixT& b,
@@ -1578,11 +1586,6 @@ void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
   applyRow(aggregate::min(), b);
 }
 
-template<>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b) {
-  applyCol(aggregate::sum(), b);
-}
-
 template<>
 void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
   applyCol(aggregate::max(), b);
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index d41dcee682cce15e94d45dafeb12bb0dce19b221..2933c20fbad930248c41969d88d45cf397b9dcf8 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <cstddef>
 #include <stdint.h>
+#include <cstddef>
+#include "TensorExpression.h"
 #include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
@@ -70,7 +71,7 @@ public:
 };
 
 template <class T>
-class BaseMatrixT {
+class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
 public:
   size_t height_, width_;
   size_t stride_;
@@ -427,14 +428,14 @@ public:
    *
    */
   void neg();
-  void exp();
-  void pow(T p);
-  void log();
-  void sqrt();
-  void square();
-  void reciprocal();
-  void abs();
-  void sign();
+  void exp2();
+  void pow2(T p);
+  void log2();
+  void sqrt2();
+  void square2();
+  void reciprocal2();
+  void abs2();
+  void sign2();
   void zero();
 
   /**
@@ -455,6 +456,17 @@ public:
    */
   void assign(T p);
 
+  /**
+   * @code
+   * swap(this, b)
+   * example: swap two Matrices
+   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+   * cpuA->deepSwap(*cpuB);
+   * @endcode
+   */
+  void deepSwap(BaseMatrixT& b);
+
   /**
    * @code
    * this = this + p
@@ -592,7 +604,7 @@ public:
    * b = this * this
    * @endcode
    */
-  void square(BaseMatrixT& b);
+  void square2(BaseMatrixT& b);
   void squareDerivative(BaseMatrixT& b);
 
   /**
@@ -616,7 +628,7 @@ public:
    * b = 1.0f / this
    * @endcode
    */
-  void reciprocal(BaseMatrixT& b);
+  void reciprocal2(BaseMatrixT& b);
   void reciprocalDerivative(BaseMatrixT& b);
 
   /**
@@ -624,7 +636,7 @@ public:
    * b = this > 0.0f ? this : -this
    * @endcode
    */
-  void abs(BaseMatrixT& b);
+  void abs2(BaseMatrixT& b);
   void absDerivative(BaseMatrixT& b);
 
   /**
@@ -642,12 +654,12 @@ public:
    */
   void expDerivative(BaseMatrixT& b);
 
-  void sign(BaseMatrixT& b);
+  void sign2(BaseMatrixT& b);
 
-  void exp(BaseMatrixT& b);
-  void pow(BaseMatrixT& b, T p);
-  void log(BaseMatrixT& b);
-  void sqrt(BaseMatrixT& b);
+  void exp2(BaseMatrixT& b);
+  void pow2(BaseMatrixT& b, T p);
+  void log2(BaseMatrixT& b);
+  void sqrt2(BaseMatrixT& b);
   void addScalar(BaseMatrixT& b, T p);
   void subScalar(BaseMatrixT& b, T p);
   void mulScalar(BaseMatrixT& b, T p);
@@ -817,7 +829,7 @@ public:
    * this = b>c ? b : c
    * @endcode
    */
-  void max(BaseMatrixT& b, BaseMatrixT& c);  //  NOLINT
+  void max2(BaseMatrixT& b, BaseMatrixT& c);
 
   /**
    * @code
@@ -916,7 +928,7 @@ public:
    * this = 1 / (p1 * b + p2)
    * @endcode
    */
-  void reciprocal(BaseMatrixT& b, T p1, T p2);
+  void reciprocal2(BaseMatrixT& b, T p1, T p2);
 
   /**
    * @code
@@ -1007,8 +1019,6 @@ public:
   /// calculate the minimum value of each row of the matrix b.
   void minRows(BaseMatrixT& b);
 
-  /// calculate the sum of each column of the matrix b.
-  void sumCols(BaseMatrixT& b);
   /// calculate the maximum value of each column of the matrix b.
   void maxCols(BaseMatrixT& b);
   /// calculate the minimum value of each column of the matrix b.
@@ -1041,6 +1051,32 @@ public:
   void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
 
   virtual bool isSparse() const { return false; }
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (useGpu_) {
+      TensorGpuApply<T>(*this, expr);
+    } else {
+      TensorCpuApply<T>(*this, expr);
+    }
+  }
+
+  template <typename ExpressionType>
+  void operator+=(const ExpressionType& expr) {
+    (*this) = (*this) + expr;
+  }
+  template <typename ExpressionType>
+  void operator-=(const ExpressionType& expr) {
+    (*this) = (*this) - expr;
+  }
+  template <typename ExpressionType>
+  void operator*=(const ExpressionType& expr) {
+    (*this) = (*this) * expr;
+  }
+  template <typename ExpressionType>
+  void operator/=(const ExpressionType& expr) {
+    (*this) = (*this) / expr;
+  }
 };
 
 typedef BaseMatrixT<real> BaseMatrix;
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 93b1bf46a10078b4ae83efdbf268f64b6da052dc..f5657c4690ca71200346efd4e2c5244c02c92eb1 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
 set(MATH_SOURCES
     "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
+    "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
     ${MATH_SOURCES})
 if(NOT WITH_GPU)
     # then compile BaseMatrix.cu as c++ file
     compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
     add_library(paddle_math STATIC
         ${MATH_SOURCES})
 else()
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index ad3f8e64efd37c27c7f462dd7c8311577a05a391..b5d5b6ef615829fc1e24ccd417e2f0b3312f072d 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_gpu.h"
 #include "CpuSparseMatrix.h"
 #include "SparseMatrix.h"
+#include "float.h"
+#include "hl_gpu.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/utils/Util.h"
-#include "float.h"
 
 namespace paddle {
 
@@ -656,9 +656,9 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
   if (format_ == SPARSE_CSR) {
     int* srcCols = src.getCols();
     size_t numLessWidth =
-        std::count_if(srcCols,
-                      srcCols + src.getElementCnt(),
-                      [this](size_t n) { return n < this->width_; });
+        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
+          return n < this->width_;
+        });
     resize(height_, width_, numLessWidth, valueType_, format_);
     rows_[0] = 0;
     size_t index = 0;
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 861564555166da0bb70d500569dc0d4f89dd2fe5..9676f8864f845e8ab75467c8ca6b6e7e68945d96 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -136,7 +136,7 @@ public:
     return sum;
   }
 
-  virtual void square() {
+  virtual void square2() {
     CHECK(isContiguous());
     if (valueType_ == NO_VALUE) {
       return;
diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h
index 67fb6c0cda6f46ddf4547b9ec9faaa8931c75eed..1e03cc5f45a96f20e49482fdfc8eba5c7124fe00 100644
--- a/paddle/math/ExecViaCpu.h
+++ b/paddle/math/ExecViaCpu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 1217163beecf19c2af215e3d4c72db644cd74b51..d7aa1184872d5a6129becca1f6e282776c9dbe15 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MathFunctions.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 
 namespace paddle {
 
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 0741c456780e36c6b87dd44d89ffc601ac928f31..c8559eefd8378450fc18c2ba821c65b39c8cc046 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp
index 878e0b8723025e75f7838e981517f58a3dcb5424..5bbc3e4e3725f186373072440a93f967178e0b27 100644
--- a/paddle/math/MathUtils.cpp
+++ b/paddle/math/MathUtils.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "MathUtils.h"
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include "Vector.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/math/MathUtils.h b/paddle/math/MathUtils.h
index 907116c00281bfcf34c6652564f55a37c3f47a8c..f2b298013877d59e1fa0eca4feb09e4f6608fdf5 100644
--- a/paddle/math/MathUtils.h
+++ b/paddle/math/MathUtils.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index b70b47a5fcc72edea8fa5a680c4af962ea0f4ae9..c69e074a76399db923a5c64243f1d3690858810d 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 075dc845768d7dfa156d33d057a30b28628c099c..5685cb7bcbbb6b90687790953d676e3792f36f36 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,20 +14,20 @@ limitations under the License. */
 
 #pragma once
 
+#include <stdint.h>
 #include <memory>
 #include <thread>
-#include <stdint.h>
 
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/ThreadLocal.h"
 
 #include <hl_gpu.h>
 
+#include "BaseMatrix.h"
 #include "MemoryHandle.h"
-#include "paddle/utils/TypeDefs.h"
 #include "Vector.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "BaseMatrix.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
@@ -408,7 +408,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void addBias(Matrix& b, real scale, bool sharedBias) {
+  void addBias(Matrix& b, real scale, bool sharedBias) {
     if (!sharedBias) {
       addBias(b, scale);
     } else {
@@ -425,7 +425,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
+  void collectBias(Matrix& a, real scale, bool sharedBias) {
     if (!sharedBias) {
       collectBias(a, scale);
     } else {
@@ -1122,6 +1122,7 @@ public:
   virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
     LOG(FATAL) << "Not implemented";
   }
+
   virtual void bilinearForward(const Matrix& in,
                                const size_t inImgH,
                                const size_t inImgW,
@@ -1142,6 +1143,15 @@ public:
                                 const real ratioW) {
     LOG(FATAL) << "Not implemented";
   }
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (useGpu_) {
+      TensorGpuApply<real>(*this, expr);
+    } else {
+      TensorCpuApply<real>(*this, expr);
+    }
+  }
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@@ -1518,6 +1528,11 @@ public:
   void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
 
   void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<real>(*this, expr);
+  }
 };
 
 class CpuMatrix : public Matrix {
@@ -1917,6 +1932,11 @@ public:
                         const size_t numChannels,
                         const real ratioH,
                         const real ratioW);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<real>(*this, expr);
+  }
 };
 
 class SharedCpuMatrix : public CpuMatrix {
@@ -1957,6 +1977,7 @@ public:
   void add(real p1, real p2);
 
 private:
+  using Matrix::mul;
   void initShared(int blockNum);
   void initBlock(int blockNum);
 
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp
index ac5b10c7bd56bb34393ac8abb98900351afc2e41..cea912d3ca02715c203814d13529aadfd9d3b7fb 100644
--- a/paddle/math/MatrixBitCode.cpp
+++ b/paddle/math/MatrixBitCode.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
 #include "Matrix.h"
 #include "hl_gpu.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/math/MemoryHandle.cpp b/paddle/math/MemoryHandle.cpp
index 9101957fc6c221bed4aa8e0c76b4c6735e50fd2d..84afb5944c3ea4aa3b8f44646b23d18b2903281b 100644
--- a/paddle/math/MemoryHandle.cpp
+++ b/paddle/math/MemoryHandle.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cmath>
 #include "MemoryHandle.h"
+#include <cmath>
 #include "Storage.h"
 
 namespace paddle {
diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h
index f12635d5d4b6ff7204d4d3e8d6f07d438c0ce1e8..0828d377c90c9fc93b79078ffed0d0b911b99c04 100644
--- a/paddle/math/MemoryHandle.h
+++ b/paddle/math/MemoryHandle.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/PoolAllocator.cpp b/paddle/math/PoolAllocator.cpp
index 2c150949dd4eca08824401685beecc19142cbd76..4282c7243a801c4116885cfa91d78c635e49ea94 100644
--- a/paddle/math/PoolAllocator.cpp
+++ b/paddle/math/PoolAllocator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 5d33b453127a5aaa355ba8c569baf1eefe931c96..c06efa9ac77a5659b242d039c38455e2ee9b0db6 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <mutex>
-#include <vector>
 #include <unordered_map>
-#include <map>
+#include <vector>
 #include "Allocator.h"
 
 namespace paddle {
diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp
index 1fb156f29bbb586b6251f961bb4fd5f4d5da0737..95219debf50e57407b668d315b91141d259fc779 100644
--- a/paddle/math/SIMDFunctions.cpp
+++ b/paddle/math/SIMDFunctions.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h
index ac82f109104d7c21f346f909984306de105c0fd4..9b0a8719b287a2b88e966484090974586d64521f 100644
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 2b0bff9535d5a9ba4a47def4d6f964c799325535..9154503c2132a740aaa42f90eb7061156403ac00 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "SparseMatrix.h"
 #include <algorithm>
+#include <iostream>
 #include <vector>
 #include "hl_gpu.h"
-#include "SparseMatrix.h"
-#include "paddle/utils/Util.h"
 #include "hl_top_k.h"
-#include <iostream>
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -537,11 +537,9 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
     dataVec.emplace_back(
         rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
   }
-  std::sort(dataVec.begin(),
-            dataVec.end(),
-            [](Element a, Element b) {
-              return a.row < b.row || (a.row == b.row && a.col < b.col);
-            });
+  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
+    return a.row < b.row || (a.row == b.row && a.col < b.col);
+  });
 
   /*get sorted data, row index, and col index, put them in the right place*/
   cols.resize(height_ + 1);
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 175ef54b858b7f8f31f45796d733af81a9d67066..bd96a3301ded2fd89bd31b94f42b0cb4718cbcb7 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <cstddef>
-#include "Matrix.h"
 #include "CpuSparseMatrix.h"
+#include "Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index eefaf4b71f4f027d00405bd4b158adc66a902ef7..b61c6b2d49ccead5e9cfdf595a8bebae0e5b87b5 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,27 +15,24 @@ limitations under the License. */
 #include "SparseRowMatrix.h"
 #include "CpuSparseMatrix.h"
 
-#include <cmath>
 #include <algorithm>
 
 #include "paddle/utils/Logging.h"
 
 #include "SIMDFunctions.h"
 
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Thread.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_bool(allow_inefficient_sparse_update,
-              false,
-              "Whether to allow inefficient sparse update");
+DEFINE_bool(allow_inefficient_sparse_update,
+            false,
+            "Whether to allow inefficient sparse update");
 
 namespace paddle {
 
 const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
 
 void SparseRowCpuMatrix::init(size_t height, size_t width) {
-  // @TODO(yuyang18) Just remove this limit
-  CHECK(simd::vec_check(width)) << width;
   height_ = height;
   if (!indexDictHandle_) {
     indexDictHandle_.reset(new IndexDict);
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 56f113a3614e2e22809abbdaa708557ed3344464..9364feb4a1462a5a9d16ca0f69213ba32ad97d21 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include <algorithm>
 #include <string.h>
-#include "paddle/utils/CommandLineParser.h"
+#include <algorithm>
 #include "Matrix.h"
+#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"
 
-P_DECLARE_bool(allow_inefficient_sparse_update);
+DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {
 
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 57ea5c926647d21a82c87fc262e2999e45e7534f..56e5442394b04230c22d668aa734dc0fa44004c2 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "Allocator.h"
 #include "Storage.h"
+#include "Allocator.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(pool_limit_size,
-               536870912,
-               "maximum memory size managed by a memory pool, default is 512M");
+DEFINE_int32(pool_limit_size,
+             536870912,
+             "maximum memory size managed by a memory pool, default is 512M");
 
 namespace paddle {
 
diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h
index 725de247e64c26ccf80a83bce70989f5c3c4fe45..06a66b5f14643153f82a1596096fc28d3e47e3fd 100644
--- a/paddle/math/Storage.h
+++ b/paddle/math/Storage.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <mutex>
 #include <vector>
-#include "paddle/utils/Locks.h"
 #include "PoolAllocator.h"
+#include "paddle/utils/Locks.h"
 
 namespace paddle {
 
diff --git a/paddle/math/TensorApply.h b/paddle/math/TensorApply.h
new file mode 100644
index 0000000000000000000000000000000000000000..11c7acb44129a3b5bf680a4b5f9ff465d44b0965
--- /dev/null
+++ b/paddle/math/TensorApply.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+
+/**
+ * \brief The tensor evaluator classes.
+ */
+template <typename Derived, class T>
+class TensorApply {
+public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
+  INLINE T& applyRef(int index) { return data_[index]; }
+
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+  INLINE bool useGpu() const { return useGpu_; }
+
+  T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+
+/**
+ * \brief The tensor evaluator classes.
+ * evaluator for rvalues
+ */
+template <typename Derived, class T>
+class TensorApply<const Derived, T> {
+public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+  INLINE bool useGpu() const { return useGpu_; }
+
+  const T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+
+template <typename Derived, class T>
+class TensorApply<const TensorExpression<Derived, T>, T> {
+public:
+  explicit TensorApply(const TensorExpression<Derived, T>& expr)
+      : expr_(expr.derived()) {}
+
+  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
+  INLINE T apply(int index) const { return expr_.apply(index); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  TensorApply<const Derived, T> expr_;
+};
+
+/**
+ * \brief The unary expression evaluator classes.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
+public:
+  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+
+  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
+  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+
+/**
+ * \brief The binary expression evaluator classes.
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+public:
+  explicit INLINE TensorApply(
+      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
+      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+
+  INLINE T apply(int i, int j) const {
+    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
+  }
+  INLINE T apply(int index) const {
+    return op_(lhs_.apply(index), rhs_.apply(index));
+  }
+
+  INLINE size_t getWidth() const { return lhs_.getWidth(); }
+  INLINE size_t getHeight() const { return rhs_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const { return lhs_.useGpu(); }
+
+  const OP op_;
+  TensorApply<LhsType, T> lhs_;
+  TensorApply<RhsType, T> rhs_;
+};
+
+/**
+ * \brief The ternary expression evaluator classes.
+ */
+template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
+class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
+public:
+  explicit INLINE TensorApply(
+      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
+      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
+    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
+    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
+    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
+    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
+    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
+#endif
+  }
+
+  INLINE T apply(int i, int j) const {
+    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
+  }
+  INLINE T apply(int index) const {
+    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
+  }
+
+  INLINE size_t getWidth() const { return expr1_.getWidth(); }
+  INLINE size_t getHeight() const { return expr1_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return expr1_.isContiguous() && expr2_.isContiguous() &&
+           expr3_.isContiguous();
+  }
+  INLINE bool useGpu() const { return expr1_.useGpu(); }
+
+  TensorApply<ArgType1, T> expr1_;
+  TensorApply<ArgType2, T> expr2_;
+  TensorApply<ArgType3, T> expr3_;
+};
+
+/**
+ * \brief The const expression evaluator classes.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
+public:
+  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+
+  INLINE T apply(int i, int j) const { return op_(i, j); }
+  INLINE T apply(int index) const { return op_(index); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return true; }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+
+}  // namespace paddle
diff --git a/paddle/math/TensorAssign.h b/paddle/math/TensorAssign.h
new file mode 100644
index 0000000000000000000000000000000000000000..943fb5649e4f1b791ab38479ca61c3fe11fc698e
--- /dev/null
+++ b/paddle/math/TensorAssign.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief Tensor Assign Expression(return by lazyAssign,
+ * and evaluated by AssignEvaluate)
+ */
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp {
+public:
+  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
+      : lhs_(lhs), rhs_(rhs) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+
+  INLINE void apply(const int i, const int j) {
+    lhs_.applyRef(i, j) = rhs_.apply(i, j);
+  }
+  INLINE void apply(const int index) {
+    lhs_.applyRef(index) = rhs_.apply(index);
+  }
+
+  INLINE size_t getWidth() const { return lhs_.getWidth(); }
+  INLINE size_t getHeight() const { return rhs_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const { return lhs_.useGpu(); }
+
+private:
+  TensorApply<LhsType, T> lhs_;
+  TensorApply<const RhsType, T> rhs_;
+};
+
+template <typename Assign, typename... AssignOp>
+void AssignCpuEvaluate(int height,
+                       int width,
+                       bool isContiguous,
+                       Assign&& assign,
+                       AssignOp&&... args) {
+  if (isContiguous) {
+    int size = height * width;
+    for (int index = 0; index < size; index++) {
+      assign.apply(index);
+      __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        assign.apply(i, j);
+        __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
+      }
+    }
+  }
+}
+
+#ifdef __NVCC__
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate1(const int border,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    assign.apply(idx);
+    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
+  }
+}
+
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate2(const int height,
+                                   const int width,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
+      assign.apply(i, j);
+      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
+    }
+  }
+}
+#endif
+
+/**
+ * \brief Evaluate one or more TensorAssignOp objects.
+ *
+ * \note At least one assignment expression is required
+ */
+template <typename Assign, typename... AssignOp>
+void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
+  const bool useGpu_ = assign.useGpu();
+  bool isContiguous_ = assign.isContiguous();
+  const size_t height = assign.getHeight();
+  const size_t width = assign.getWidth();
+
+  const int packSize = sizeof...(args);
+  const bool packUseGpu[] = {((args)).useGpu()...};
+  const bool packIsContiguous[] = {((args)).isContiguous()...};
+  const size_t packHeight[] = {((args)).getHeight()...};
+  const size_t packWidth[] = {((args)).getWidth()...};
+
+  for (int i = 0; i < packSize; i++) {
+    CHECK_EQ(useGpu_, packUseGpu[i]);
+    CHECK_EQ(height, packHeight[i]);
+    CHECK_EQ(width, packWidth[i]);
+    isContiguous_ = isContiguous_ && packIsContiguous[i];
+  }
+
+  if (useGpu_) {
+#ifdef __NVCC__
+    if (isContiguous_) {
+      int size = height * width;
+      int blockSize = size <= 1024 ? size : 1024;
+      int gridSize = (size + 1024 - 1) / 1024;
+      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+          size, assign, args...);
+    } else {
+      int blockSizeY = std::min(32, (int)height);
+      int blockSizeX = (32 / blockSizeY) * 32;
+      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
+      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
+      dim3 threads(blockSizeX, blockSizeY);
+      dim3 grid(gridSizeX, gridSizeY);
+      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
+          height, width, assign, args...);
+    }
+
+    CHECK_SYNC("AssignEvaluate failed");
+#endif
+  } else {
+    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h
new file mode 100644
index 0000000000000000000000000000000000000000..9de2099b850d1723fe085eeed97c5b141629eec1
--- /dev/null
+++ b/paddle/math/TensorEvaluate.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include "hl_base.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief The tensor cpu evaluate api.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+
+  int height = lhs_.getHeight();
+  int width = lhs_.getWidth();
+  if (lhs_.isContiguous() && rhs_.isContiguous()) {
+    int size = height * width;
+    for (int index = 0; index < size; index++) {
+      lhs_.applyRef(index) = rhs_.apply(index);
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        lhs_.applyRef(i, j) = rhs_.apply(i, j);
+      }
+    }
+  }
+}
+
+#ifdef __NVCC__
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs,
+                                    RightType rhs,
+                                    const int border) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    lhs.applyRef(idx) = rhs.apply(idx);
+  }
+}
+
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
+      lhs.applyRef(i, j) = rhs.apply(i, j);
+    }
+  }
+}
+
+/**
+ * \brief The tensor gpu evaluate api.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+
+  int dimM = lhs_.getHeight();
+  int dimN = lhs_.getWidth();
+
+  if (lhs_.isContiguous() && rhs_.isContiguous()) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+        lhs_, rhs_, size);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
+  }
+
+  CHECK_SYNC("TensorGpuApply failed");
+}
+#else
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bd789e8c511f33d8415e421281e99eb10fc63fe
--- /dev/null
+++ b/paddle/math/TensorExpression.h
@@ -0,0 +1,446 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <cstddef>
+#include "hl_tensor_ops.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/TypeDefs.h"
+
+namespace paddle {
+
+template <class OP, typename ExprType, class T>
+class TensorConstant;
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp;
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp;
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp;
+
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp;
+
+/**
+ * \brief Tensor base class.
+ *
+ * This is the base class of all Tensor and Expression class.
+ */
+template <typename Derived, class T>
+class TensorExpression {
+public:
+  /**
+   * Element wise unary expression.
+   */
+  template <typename UnaryOp>
+  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
+      const UnaryOp& op) const {
+    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
+  }
+
+  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+      T p) const {
+    return unaryExpression(hppl::unary::add_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
+      T p) const {
+    return unaryExpression(hppl::unary::sub_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+      T p) const {
+    return unaryExpression(hppl::unary::mul_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
+      T p) const {
+    return unaryExpression(hppl::unary::div_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
+    return unaryExpression(hppl::unary::neg<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
+    return unaryExpression(hppl::unary::exp_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
+    return unaryExpression(hppl::unary::log_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
+    return unaryExpression(hppl::unary::sqrt_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
+    return unaryExpression(hppl::unary::square<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
+      const {
+    return unaryExpression(hppl::unary::reciprocal<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
+    return unaryExpression(hppl::unary::abs<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
+    return unaryExpression(hppl::unary::sign<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
+    return unaryExpression(hppl::unary::pow_op<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
+    return unaryExpression(hppl::unary::min<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
+    return unaryExpression(hppl::unary::max<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_eq<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ne<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_le<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_lt<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ge<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_gt<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
+      T p) const {
+    return unaryExpression(hppl::unary::and_op<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
+      T p) const {
+    return unaryExpression(hppl::unary::or_op<T>(p));
+  }
+
+  /**
+   * Element wise binary expression.
+   */
+  template <typename BinaryOp, typename ExpressionType>
+  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
+  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
+    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
+        op, derived(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator==(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator!=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_le<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::and_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator&&(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::and_op<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::or_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator||(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::or_op<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::add<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator+(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::add<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::sub<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator-(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::sub<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::mul<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator*(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::mul<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::div<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator/(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::div<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::min<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  min(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::min<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::max<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  max(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::max<T>(), expr);
+  }
+
+  /**
+   * Element wise ternary expression.
+   *
+   * ternary conditional operator(?: operator).
+   * The conditional expression returns one of two values depending on
+   * the result of derived expression.
+   * If derived expression evaluates to true, then expression1 is evaluated.
+   * If derived expression evaluates to false, then expression2 is evaluated.
+   */
+  template <typename ExprType1, typename ExprType2>
+  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
+  condition(const ExprType1& expr1, const ExprType2& expr2) const {
+    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
+        derived(), expr1, expr2);
+  }
+
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const ExprType,
+      T>
+  condition(T p, const ExprType& expr) const {
+    return condition(constant(p), expr);
+  }
+
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const ExprType,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(const ExprType& expr, T p) const {
+    return condition(expr, constant(p));
+  }
+
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(T p1, T p2) const {
+    return condition(constant(p1), constant(p2));
+  }
+
+  /**
+   * return a TensorConstant. A TensorConstant object hold a constant value.
+   */
+  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
+      T p) const {
+    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
+        hppl::unary::constant<T>(p), derived());
+  }
+
+  /**
+   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
+   * TensorAssignOp objects.
+   */
+  template <typename ExpressionType>
+  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
+      const ExpressionType& expr) const {
+    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
+  }
+
+protected:
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+/**
+ * \brief Unary Operator Expression
+ */
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp
+    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
+public:
+  explicit TensorUnaryOp(const OP op, const ExprType& expr)
+      : op_(op), expr_(expr) {}
+
+  const OP op_;
+  const ExprType expr_;
+};
+
+/**
+ * \brief Binary Operator Expression
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp
+    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+public:
+  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
+      : op_(op), lhs_(lhs), rhs_(rhs) {}
+
+  const OP op_;
+  const LhsType lhs_;
+  const RhsType rhs_;
+};
+
+/**
+ * \brief Ternary Operator Expression
+ */
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp : public TensorExpression<
+                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
+                            T> {
+public:
+  explicit TensorTernaryOp(const ExprType1& expr1,
+                           const ExprType2& expr2,
+                           const ExprType3& expr3)
+      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
+
+  const ExprType1 expr1_;
+  const ExprType2 expr2_;
+  const ExprType3 expr3_;
+};
+
+/**
+ * \brief Constant Expression
+ */
+template <class OP, typename ExprType, class T>
+class TensorConstant
+    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
+public:
+  explicit TensorConstant(const OP op, const ExprType& expr)
+      : op_(op), expr_(expr) {}
+
+  const OP op_;
+  const ExprType expr_;
+};
+
+/**
+ * \brief operator+ overload
+ * \return a unary operator expression
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+    T p, const TensorExpression<Derived, T>& expr) {
+  return expr + p;
+}
+
+/**
+ * \brief operator* overload
+ * \return a unary operator expression
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+    T p, const TensorExpression<Derived, T>& expr) {
+  return expr * p;
+}
+
+}  // namespace paddle
+
+#include "TensorApply.h"
+#include "TensorEvaluate.h"
diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..72ff077270382d52bfcd340cc64d9abf49d1705d
--- /dev/null
+++ b/paddle/math/TrainingAlgorithmOp.cu
@@ -0,0 +1,357 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Logging.h"
+#include "BaseMatrix.h"
+#include "TrainingAlgorithmOp.h"
+
+#if __cplusplus > 199711L
+
+#include "TensorAssign.h"
+
+namespace paddle {
+
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
+  auto expr2 = momV.lazyAssign(
+    momV + (tau * alpha * gamma * learningRate) * grad);
+  auto expr3 = value.lazyAssign(
+    (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
+  auto expr2 = lr.lazyAssign(
+    ((accum_update + epsilon) / (accum + epsilon)).sqrt());
+  auto expr3 = accum_update.lazyAssign(
+    rou * accum_update + ((real)1 - rou) * (grad * lr).square());
+  auto expr4 = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr5 = value.lazyAssign(value + mom);
+
+  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+}
+
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  auto expr1 = accum.lazyAssign(accum + grad.square());
+  auto expr2 = lr.lazyAssign(
+    (accum_buffer + accum + epsilon).sqrt().reciprocal());
+  auto expr3 = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr4 = value.lazyAssign(value + mom);
+
+  AssignEvaluate(expr1, expr2, expr3, expr4);
+}
+
+void rmspropApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& g,
+                  BaseMatrix& f,
+                  BaseMatrix& lr,
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
+  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
+  auto expr4 = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr5 = value.lazyAssign(value + mom);
+
+  if (firstTime) {
+    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+  } else {
+    auto expr1 = g.lazyAssign(
+      accumulatedRou * g + ((real)1 - rou) * grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+  }
+}
+
+void decayedAdagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
+  auto expr3 = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr4 = value.lazyAssign(value + mom);
+
+  if (firstTime) {
+    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4);
+  } else {
+    auto expr1 = accum.lazyAssign(
+      accumulatedRou * accum + ((real)1 - rou) * grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4);
+  }
+}
+
+void adamApply(BaseMatrix& value,
+               BaseMatrix& grad,
+               BaseMatrix& mom,  // firse moment
+               BaseMatrix& v,    // second moment
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha = learningRate *
+      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+
+  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
+  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
+  auto expr3 = value.lazyAssign(
+    value - (mom * alpha) / (v.sqrt() + epsilon));
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+void adamaxApply(BaseMatrix& value,
+                 BaseMatrix& grad,
+                 BaseMatrix& mom,  // firse moment
+                 BaseMatrix& u,    // weighted infinity norm
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
+  auto expr2 = u.lazyAssign(
+    (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
+  auto expr3 = value.lazyAssign(
+    value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+}  // namespace paddle
+
+#else
+
+namespace paddle {
+
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  /**
+   * \alpha_t = \alpha_{t-1} / k
+   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
+   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
+   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
+   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
+   */
+  momU -= (alpha * gamma * learningRate) * grad;
+  momV += (tau * alpha * gamma * learningRate) * grad;
+  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
+}
+
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  accum = rou * accum + ((real)1 - rou) * grad.square();
+
+  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
+  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
+
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  accum += grad.square();
+  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void rmspropApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& g,
+                  BaseMatrix& f,
+                  BaseMatrix& lr,
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    g = accumulatedRou * g + grad.square();
+  } else {
+    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
+  }
+
+  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
+  f = accumulatedRou * f + ((real)1 - rou) * grad;
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
+  // Basiclly if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  lr = (g - f.square() + epsilon).sqrt().reciprocal();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void decayedAdagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    accum = accumulatedRou * accum + grad.square();
+  } else {
+    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
+  }
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basiclly if the bigger the magnitude gradient is,
+  // the smaller the learning rate will be.
+  lr = (accum + epsilon).sqrt().reciprocal();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void adamApply(BaseMatrix& value,
+               BaseMatrix& grad,
+               BaseMatrix& mom,  // firse moment
+               BaseMatrix& v,    // second moment
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha = learningRate *
+      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
+  v = beta2 * v + ((real)1 - beta2) * grad.square();
+
+  value -=  (mom * alpha) / (v.sqrt() + epsilon);
+}
+
+void adamaxApply(BaseMatrix& value,
+                 BaseMatrix& grad,
+                 BaseMatrix& mom,  // firse moment
+                 BaseMatrix& u,    // weighted infinity norm
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
+
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
+}
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/math/TrainingAlgorithmOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..881a8d72d888083ad87a536c127009d68c51076e
--- /dev/null
+++ b/paddle/math/TrainingAlgorithmOp.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "BaseMatrix.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief Sparse Momentum optimizer.
+ */
+extern void sparseMomentumApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& momU,
+                                BaseMatrix& momV,
+                                real alpha,
+                                real beta,
+                                real gamma,
+                                real tau,
+                                real learningRate);
+
+/**
+ * \brief AdaDelta optimizer.
+ */
+extern void adadeltaApply(BaseMatrix& value,
+                          BaseMatrix& grad,
+                          BaseMatrix& sum,
+                          BaseMatrix& sum1,
+                          BaseMatrix& mom,
+                          BaseMatrix& lr,
+                          real rou,
+                          real epsilon,
+                          real learningRate,
+                          real momentum,
+                          real decayRate);
+
+/**
+ * \brief AdaGrad optimizer.
+ */
+extern void adagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& sum,
+                         BaseMatrix& sum1,
+                         BaseMatrix& mom,
+                         BaseMatrix& lr,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate);
+
+/**
+ * \brief RMSProp optimizer.
+ */
+extern void rmspropApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& g,
+                         BaseMatrix& f,
+                         BaseMatrix& mom,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime);
+
+/**
+ * \brief Decayed AdaGrad optimizer.
+ */
+extern void decayedAdagradApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& mom,
+                                BaseMatrix& accum,
+                                BaseMatrix& lr,
+                                real accumulatedRou,
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate,
+                                bool firstTime);
+
+/**
+ * \brief Adam optimizer.
+ */
+extern void adamApply(BaseMatrix& value,
+                      BaseMatrix& grad,
+                      BaseMatrix& mom,
+                      BaseMatrix& v,
+                      real beta1,
+                      real beta2,
+                      real beta1_power,
+                      real beta2_power,
+                      real epsilon,
+                      real learningRate);
+
+/**
+ * \brief AdaMax optimizer.
+ */
+extern void adamaxApply(BaseMatrix& value,
+                        BaseMatrix& grad,
+                        BaseMatrix& mom,  // firse moment
+                        BaseMatrix& u,    // weighted infinity norm
+                        real beta1,
+                        real beta2,
+                        int64_t step,
+                        real alpha);
+}  // namespace paddle
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index b2ade83138428a510e6be1bfa82290008e4167d0..eaa1cdce305c2f9d7a517e9e8c8606dc1f70780b 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "Vector.h"
+#include "paddle/utils/Util.h"
 
 #include <memory>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/Thread.h"
-#include "paddle/utils/Flags.h"
 #include "Matrix.h"
 #include "hl_gpu.h"
 #include "hl_table_apply.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Thread.h"
+#include "paddle/utils/ThreadLocal.h"
 
 namespace paddle {
 
@@ -754,8 +754,7 @@ void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
 }
 
 template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu)
-    : sync_(nullptr) {
+CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
   if (!useGpu) {
     cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
   } else {
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 46a25c04dff6041222b8c97b8904322546f2bbe3..8a24103bd4107035c8068c24ec3be6ec06957112 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
 #include <cmath>
+#include <memory>
 
 #include <hl_gpu.h>
 
-#include "MemoryHandle.h"
-#include "paddle/utils/TypeDefs.h"
 #include "BaseMatrix.h"
+#include "MemoryHandle.h"
 #include "paddle/utils/Thread.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
@@ -265,6 +265,15 @@ public:
   /// print the "idx" element of the Vector
   virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
 
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (BaseVector<T>::useGpu_) {
+      TensorGpuApply<T>(*this, expr);
+    } else {
+      TensorCpuApply<T>(*this, expr);
+    }
+  }
+
 protected:
   friend class GpuVectorT<T>;
   friend class CpuVectorT<T>;
@@ -322,6 +331,11 @@ public:
   virtual void print(std::ostream& os, size_t num) const;
   virtual void printOneElement(std::ostream& os, size_t idx) const;
 
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<T>(*this, expr);
+  }
+
 protected:
   virtual void copyTo(CpuVectorT<T>* dest) const;
   virtual void copyTo(GpuVectorT<T>* dest) const;
@@ -385,6 +399,11 @@ public:
   virtual T get(size_t pos);
   virtual void print(std::ostream& os, size_t num) const;
   virtual void printOneElement(std::ostream& os, size_t idx) const;
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<T>(*this, expr);
+  }
 };
 
 template <class T>
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index 247be983ba3296383c8e2f30f1036859ecfde492..a3ea078509704f305672d0b02d272de0f6c97f51 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
-add_simple_unittest(test_matrix)
+add_simple_unittest(test_TrainingAlgorithm)
+add_simple_unittest(test_SparseMatrix)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
@@ -13,4 +14,20 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+
+if(WITH_GPU)
+    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
+    link_paddle_test(test_Tensor)
+    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
+    link_paddle_test(test_lazyAssign)
+else()
+    compile_cu_as_cpp(test_Tensor.cu)
+    add_unittest(test_Tensor test_Tensor.cu)
+    compile_cu_as_cpp(test_lazyAssign.cu)
+    add_unittest(test_lazyAssign test_lazyAssign.cu)
+endif(WITH_GPU)
+
 add_simple_unittest(test_FPException)
+add_simple_unittest(test_GpuProfiler)
+add_simple_unittest(test_BaseMatrix)
+add_simple_unittest(test_Matrix)
diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h
new file mode 100644
index 0000000000000000000000000000000000000000..0188372771d97942a0761c673d40d040528ff59a
--- /dev/null
+++ b/paddle/math/tests/OriginalOptimizerApi.h
@@ -0,0 +1,201 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/Vector.h"
+#include "paddle/utils/GlobalConstants.h"
+
+using namespace paddle;  // NOLINT
+
+void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
+                                      real alpha,
+                                      real beta,
+                                      real gamma,
+                                      real tau,
+                                      real learningRate) {
+  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
+                                   -alpha * gamma * learningRate);
+  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
+                                   tau * alpha * gamma * learningRate);
+  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
+                             tau / beta + 1.0 / alpha,
+                             *vecs[PARAMETER_MOMENTUM_VT],
+                             1.0 / beta);
+}
+
+void AdagradParameterOptimizer(const VectorPtr vecs[],
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate) {
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
+                                                1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
+                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
+
+  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
+  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
+                                        epsilon,
+                                        epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
+
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
+      *vecs[PARAMETER_GRADIENT],
+      *vecs[PARAMETER_LEARNING_RATE],
+      rou,
+      1.0f - rou);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void RMSPropParameterOptimizer(const VectorPtr vecs[],
+                               real accumulatedRou,
+                               real rou,
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate,
+                               bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
+  // Basiclly if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                           -1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
+                                      real accumulatedRou,
+                                      real rou,
+                                      real epsilon,
+                                      real learningRate,
+                                      real momentum,
+                                      real decayRate,
+                                      bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basiclly if the bigger the magnitude gradient is,
+  // the smaller the learning rate will be.
+  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdamParameterOptimizer(const VectorPtr vecs[],
+                            real beta1,
+                            real beta2,
+                            real beta1_power,
+                            real beta2_power,
+                            real epsilon,
+                            real learningRate) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
+  g->square2();
+  v->add(*g, beta2, 1 - beta2);
+
+  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
+  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
+  g->sqrt2(*v);
+  g->dotDiv(*m, *g, 0., epsilon);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  theta->add(*theta, 1.0, *g, -alpha);
+}
+
+void AdamaxParameterOptimizer(
+    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u->mulScalar(beta2);
+  g->abs2();
+  u->max2(*u, *g);
+
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  g->dotDiv(*m, *u);
+  real learningRate = alpha / (1 - std::pow(beta1, step));
+  theta->add(*theta, 1.0, *g, -learningRate);
+}
diff --git a/paddle/math/tests/PerfUtils.h b/paddle/math/tests/PerfUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c6a63ce6c03c0db993d4bb52fe536e82ba2d726
--- /dev/null
+++ b/paddle/math/tests/PerfUtils.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// Performance Check
+#ifdef PADDLE_DISABLE_TIMER
+
+#define EXPRESSION_PERFORMANCE(expression) expression;
+
+#else
+
+#include "paddle/utils/Stat.h"
+using namespace paddle;  // NOLINT
+
+#define EXPRESSION_PERFORMANCE(expression)                             \
+  do {                                                                 \
+    char expr[30];                                                     \
+    strncpy(expr, #expression, 30);                                    \
+    if (expr[29] != '\0') {                                            \
+      expr[27] = '.';                                                  \
+      expr[28] = '.';                                                  \
+      expr[29] = '\0';                                                 \
+    }                                                                  \
+    expression;                                                        \
+    for (int i = 0; i < 20; i++) {                                     \
+      REGISTER_TIMER(expr);                                            \
+      expression;                                                      \
+    }                                                                  \
+    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
+              << *globalStat.getStat(expr);                            \
+    globalStat.reset();                                                \
+  } while (0)
+
+#endif
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc
--- /dev/null
+++ b/paddle/math/tests/TensorCheck.h
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * This file provides a TensorCheck template function, which can be used to
+ * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
+ */
+
+#include <cmath>
+#include "paddle/math/Matrix.h"
+
+namespace autotest {
+
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::VectorT;
+using paddle::CpuVectorT;
+using paddle::GpuVectorT;
+
+class AssertEqual {
+public:
+  AssertEqual(real err = 0) : err_(err) {}
+
+  inline bool operator()(real a, real b) {
+    if (err_ == 0) {
+      if (a != b) {
+        return false;
+      }
+    } else {
+      if (std::fabs(a - b) > err_) {
+        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+private:
+  real err_;
+};
+
+template <typename Tensor>
+class CopyToCpu;
+
+template <>
+class CopyToCpu<CpuMatrix> {
+public:
+  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
+  const CpuMatrix& copiedArg() const { return arg_; }
+
+private:
+  const CpuMatrix& arg_;
+};
+
+template <>
+class CopyToCpu<GpuMatrix> {
+public:
+  explicit CopyToCpu(const GpuMatrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+
+private:
+  CpuMatrix arg_;
+};
+
+template <>
+class CopyToCpu<Matrix> {
+public:
+  explicit CopyToCpu(const Matrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+
+private:
+  CpuMatrix arg_;
+};
+
+template <typename T>
+class CopyToCpu<CpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
+  const CpuVectorT<T>& copiedArg() const { return arg_; }
+
+private:
+  const CpuVectorT<T>& arg_;
+};
+
+template <typename T>
+class CopyToCpu<GpuVectorT<T>> {
+public:
+  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+private:
+  CpuVectorT<T> arg_;
+};
+
+template <typename T>
+class CopyToCpu<VectorT<T>> {
+public:
+  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+private:
+  CpuVectorT<T> arg_;
+};
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare,
+                 const CpuMatrix& matrix1,
+                 const CpuMatrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (!compare(a, b)) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+template <typename AssertEq, class T>
+void TensorCheck(AssertEq compare,
+                 const CpuVectorT<T>& vector1,
+                 const CpuVectorT<T>& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const T* data1 = vector1.getData();
+  const T* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (!compare(a, b)) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+template <typename AssertEq, typename Tensor1, typename Tensor2>
+void TensorCheck(AssertEq compare,
+                 const Tensor1& tensor1,
+                 const Tensor2& tensor2) {
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, real args1, real args2) {
+  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
+                                         << ", args2 = " << args2;
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
+  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
+                          << ", args2 = " << args2;
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
+  AssertEqual compare(0);
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
+#ifndef PADDLE_TYPE_DOUBLE
+  AssertEqual compare(1e-3);
+#else
+  AssertEqual compare(1e-10);
+#endif
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+}  // namespace autotest
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3020961880484a7944f8cc61377a4f08122e403
--- /dev/null
+++ b/paddle/math/tests/TestUtils.h
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * This file provides a AutoCompare calss to simplify the comparison
+ * of CPU and GPU member functions.
+ *
+ * This takes two steps
+ * 1. Construct an AutoCompare object.
+ *    When constructing an AutoCompare object, you can set the err argument
+ * to specify the maximum error for CPU and GPU functions.
+ *
+ * 2. Use the template functions cmpWithArg or cmpWithoutArg.
+ * A. [cmpWithArg] Requires the caller construct the cpu arguments.
+ *
+ *  AutoCompare test;
+ *  Init Argument arg1,arg2...
+ *  test.cmpWithArg(function, arg1, arg2....)
+ *
+ * B. [cmpWithoutArg] The caller do not need construct arguments.
+ *    If matrix used in these functions arguments is the same size.
+ *    Such as the element wise function and the aggregate function
+ *    defined in the BaseMatrix.cpp.
+ *
+ *  AutoCompare test;
+ *  test.cmpWithoutArg<I...>(function, height, width)
+*/
+
+#include <gtest/gtest.h>
+#include "TensorCheck.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace autotest {
+
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using paddle::CpuSparseMatrix;
+using paddle::GpuSparseMatrix;
+
+template <typename T1, typename T2>
+class ReplaceType {
+public:
+  typedef T1 type;
+};
+
+template <>
+class ReplaceType<BaseMatrix, CpuMatrix> {
+public:
+  typedef CpuMatrix type;
+};
+
+template <>
+class ReplaceType<BaseMatrix, GpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+
+template <>
+class ReplaceType<Matrix, CpuMatrix> {
+public:
+  typedef CpuMatrix type;
+};
+
+template <>
+class ReplaceType<Matrix, GpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+
+// construct a argument
+template <typename T>
+T construct(int height, int width);
+
+template <>
+float construct(int height, int width) {
+  return 0.5;
+}
+
+template <>
+double construct(int height, int width) {
+  return 0.5;
+}
+
+template <>
+size_t construct(int height, int width) {
+  size_t offset = std::rand() % (height < width ? height : width);
+  return offset;
+}
+
+template <>
+CpuMatrix construct(int height, int width) {
+  CpuMatrix a(height, width);
+  return a;
+}
+
+template <>
+GpuMatrix construct(int height, int width) {
+  GpuMatrix a(height, width);
+  return a;
+}
+
+// init a argument
+template <typename T>
+void init(T& v) {
+  return;
+}
+
+template <>
+void init(CpuMatrix& v) {
+  v.randomizeUniform();
+}
+
+template <>
+void init(GpuMatrix& v) {
+  v.randomizeUniform();
+}
+
+// init a tuple which contains a set of arguments.
+template <std::size_t I = 0, typename... Args>
+inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
+    std::tuple<Args...>& t) {}
+
+template <std::size_t I = 0, typename... Args>
+    inline typename std::enable_if <
+    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
+  init(std::get<I>(t));
+  initTuple<I + 1>(t);
+}
+
+// copy a argument, copy src to dest
+template <typename T1, typename T2>
+void copy(T1& dest, T2& src) {
+  dest = src;
+}
+
+template <>
+void copy(GpuMatrix& dest, CpuMatrix& src) {
+  dest.copyFrom(src);
+}
+
+// copy a tuple, copy src to dest
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
+    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
+
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+    inline typename std::enable_if <
+    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
+                                              std::tuple<Args2...>& src) {
+  copy(std::get<I>(dest), std::get<I>(src));
+  copyTuple<I + 1>(dest, src);
+}
+
+// call member function
+template <typename C,
+          typename FC,
+          typename R,
+          typename... FArgs,
+          typename... Args>
+R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
+  return (obj.*f)(args...);
+}
+
+template <typename T>
+class ReturnType {
+public:
+  typedef T type;
+};
+
+template <>
+class ReturnType<CpuMatrix> {
+public:
+  typedef GpuMatrix type;
+};
+
+template <>
+class ReturnType<CpuIVector> {
+public:
+  typedef GpuIVector type;
+};
+
+template <>
+class ReturnType<CpuSparseMatrix> {
+public:
+  typedef GpuSparseMatrix type;
+};
+
+template <typename T>
+typename ReturnType<T>::type autoArgs(T& v) {
+  return v;
+}
+
+template <>
+GpuMatrix autoArgs(CpuMatrix& v) {
+  GpuMatrix a(v.getHeight(), v.getWidth());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuIVector autoArgs(CpuIVector& v) {
+  GpuIVector a(v.getSize());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
+  GpuSparseMatrix a(v.getHeight(),
+                    v.getWidth(),
+                    v.getElementCnt(),
+                    v.getValueType(),
+                    v.getFormat());
+  a.copyFrom(v, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return a;
+}
+
+class AutoCompare {
+public:
+  /**
+   * err is the allowed calculation error.
+   * The smaller the value of err,
+   * the stricter the comparison is between CPU and GPU calculations.
+   */
+  AutoCompare(size_t height, size_t width, real err = 1e-3)
+      : cpu(height, width), gpu(height, width), compare(err) {
+    init(cpu);
+    copy(gpu, cpu);
+  }
+
+  template <typename C, typename R, typename... FArgs, typename... Args>
+  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
+    static_assert(sizeof...(FArgs) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    call(cpu, f, args...);
+    call(gpu, f, autoArgs(args)...);
+
+    TensorCheck(compare, cpu, gpu);
+  }
+
+  template <std::size_t... I, typename C, typename R, typename... Args>
+  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
+    static_assert(sizeof...(I) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    (void)height;
+    (void)width;
+    auto tuple1 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            CpuMatrix>::type>(height, width)...);
+
+    auto tuple2 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            GpuMatrix>::type>(height, width)...);
+
+    initTuple(tuple1);
+    copyTuple(tuple2, tuple1);
+
+    call(cpu, f, std::get<I>(tuple1)...);
+    call(gpu, f, std::get<I>(tuple2)...);
+
+    TensorCheck(compare, cpu, gpu);
+  }
+
+protected:
+  CpuMatrix cpu;
+  GpuMatrix gpu;
+  AssertEqual compare;
+};
+
+}  // namespace autotest
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 084322a1caf579cf6237b41c51efa220c6f2d5a2..33e0952efedddec16acf6153209e14f18fd48134 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 #define private public
-#include "paddle/math/MemoryHandle.h"
 #include "paddle/math/Allocator.h"
+#include "paddle/math/MemoryHandle.h"
 #include "paddle/math/PoolAllocator.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc7c1e7eb2734605cb278a4b97cab22bdba1594e
--- /dev/null
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -0,0 +1,251 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_ONLY_CPU
+/**
+ * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
+ * implementation of CPU and GPU member function in
+ * BaseMatrix.cpp and Matrix.cpp.
+ */
+
+#include <gtest/gtest.h>
+#include "TestUtils.h"
+#include "paddle/math/BaseMatrix.h"
+
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using autotest::AutoCompare;
+
+// Test all void (BaseMatrix::*)() function
+TEST(BaseMatrix, void) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)()) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg(f, height, width);
+      };
+
+      compare(&BaseMatrix::neg);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::zero);
+      compare(&BaseMatrix::one);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real) function
+TEST(BaseMatrix, real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::pow2);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::biggerThanScalar);
+      compare(&BaseMatrix::downClip);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::relu);
+      compare(&BaseMatrix::reluDerivative);
+      compare(&BaseMatrix::softrelu);
+      compare(&BaseMatrix::softreluDerivative);
+      compare(&BaseMatrix::brelu);
+      compare(&BaseMatrix::breluDerivative);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::squareDerivative);
+      compare(&BaseMatrix::tanh);
+      compare(&BaseMatrix::tanhDerivative);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::reciprocalDerivative);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::absDerivative);
+      compare(&BaseMatrix::sigmoid);
+      compare(&BaseMatrix::sigmoidDerivative);
+      compare(&BaseMatrix::expDerivative);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareMul);
+      compare(&BaseMatrix::addColVector);
+      compare(&BaseMatrix::addRowVector);
+      compare(&BaseMatrix::mulRowVector);
+      compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::addP2P);
+      compare(&BaseMatrix::invSqrt);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real, real) function
+TEST(BaseMatrix, real_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::clip);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
+TEST(BaseMatrix, BaseMatrix_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::addBias);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::pow2);
+      compare(&BaseMatrix::addScalar);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::scalarDiv);
+      compare(&BaseMatrix::addSquare);
+      compare(&BaseMatrix::isEqualTo);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height,
+                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::softCrossEntropy);
+      compare(&BaseMatrix::softCrossEntropyBp);
+      compare(&BaseMatrix::binaryLabelCrossEntropy);
+      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::add2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotDiv);
+      compare(&BaseMatrix::logisticRegressionLoss);
+      compare(&BaseMatrix::logisticRegressionLossBp);
+      compare(&BaseMatrix::biggerThan);
+      compare(&BaseMatrix::max2);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareSquare);
+    }
+  }
+}
+
+void TestEelementWise(size_t height, size_t width) {
+  AutoCompare rowScale(height, width);
+  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
+
+  AutoCompare rowDotMul(height, width);
+  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
+
+  AutoCompare binaryClassificationError(height, width);
+  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
+      &BaseMatrix::binaryClassificationError, height, width);
+
+  AutoCompare sumOfSquaresBp(height, width);
+  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
+}
+
+void TestAggregateToRow(size_t height, size_t width) {
+  AutoCompare maxCols(1, width);
+  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
+
+  AutoCompare minCols(1, width);
+  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
+
+  AutoCompare addDotMulVMM(1, width);
+  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
+
+  AutoCompare sumCols(1, width);
+  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
+
+  AutoCompare collectBias(1, width);
+  collectBias.cmpWithoutArg<0, 1>(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
+      height,
+      width);
+}
+
+void TestAggregateToCol(size_t height, size_t width) {
+  AutoCompare maxRows(height, 1);
+  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
+
+  AutoCompare minRows(height, 1);
+  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
+
+  AutoCompare sumRows(height, 1);
+  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
+
+  AutoCompare sumOfSquares(height, 1);
+  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
+}
+
+TEST(BaseMatrix, Other) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      TestEelementWise(height, width);
+      TestAggregateToRow(height, width);
+      TestAggregateToCol(height, width);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 7b50b020cda9315c48b897cdab82675c625094ef..624fa20ca58bca3f16fa567487bbaa5d9656e1b1 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
-#include "paddle/utils/Util.h"
+#include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Util.h"
 #include "test_matrixUtil.h"
-#include <gtest/gtest.h>
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index b3eca19a7291d2b71b801793f824c1087a3ded27..27216ddb58eccd7fd52e121e795baf463ea69f51 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/utils/PythonUtil.h>
 #include <gtest/gtest.h>
-#include <vector>
+#include <paddle/utils/PythonUtil.h>
 #include <paddle/utils/Util.h>
+#include <vector>
 #include "paddle/math/SparseMatrix.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index f996e0daddd3ef41e195de48640631a979a87192..6aa5891bce922c00cbb4f69a511fb3c42d53f319 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d490078d909e7940e83a6f461f9386eeda02f53c
--- /dev/null
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <gtest/gtest.h>
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (fabs(a - b) > err) {
+        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
+          count++;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+void testBilinearFwdBwd(int numSamples,
+                        int imgSizeH,
+                        int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+
+  {
+    // nvprof: GPU Proflier
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input,
+                            imgSizeH,
+                            imgSizeW,
+                            2 * imgSizeH,
+                            2 * imgSizeW,
+                            channels,
+                            ratioH,
+                            ratioW);
+    targetGpu->bilinearForward(*inputGpu,
+                               imgSizeH,
+                               imgSizeW,
+                               2 * imgSizeH,
+                               2 * imgSizeW,
+                               channels,
+                               ratioH,
+                               ratioW);
+  }
+
+  // check
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheckGrad =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->bilinearBackward(*targetGrad,
+                              2 * imgSizeH,
+                              2 * imgSizeW,
+                              imgSizeH,
+                              imgSizeW,
+                              channels,
+                              ratioH,
+                              ratioW);
+  inputGpuGrad->bilinearBackward(*targetGpuGrad,
+                                 2 * imgSizeH,
+                                 2 * imgSizeW,
+                                 imgSizeH,
+                                 imgSizeW,
+                                 channels,
+                                 ratioH,
+                                 ratioW);
+
+  // check
+  targetCheckGrad->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+
+TEST(Profiler, testBilinearFwdBwd) {
+  auto numSamples = 10;
+  auto channels = 16;
+  auto imgSize = 64;
+  {
+    // nvprof: GPU Proflier
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    // Paddle built-in timer
+    REGISTER_TIMER_INFO(
+        "testBilinearFwdBwd",
+        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
+  }
+  globalStat.printAllStatus();
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  // nvprof: GPU Proflier
+  REGISTER_GPU_PROFILER(
+      "RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
+  return RUN_ALL_TESTS();
+}
+
+#endif /* PADDLE_ONLY_CPU */
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adb5fbd9fa30d810a25a2eb11f6d57474c1304c7
--- /dev/null
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -0,0 +1,300 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_ONLY_CPU
+/**
+ * This test file use autotest::AutoCompare and cmpWithArg to compares the
+ * implementation of CPU and GPU member function in Matrix.cpp.
+ */
+
+#include <gtest/gtest.h>
+#include "TestUtils.h"
+
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::CpuIVector;
+using paddle::CpuSparseMatrix;
+using autotest::AutoCompare;
+
+void testBilinearFwdBwd(int numSamples,
+                        int imgSizeH,
+                        int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+
+  AutoCompare forward(numSamples, outWidth);
+  CpuMatrix arg1(numSamples, inWidth);
+  arg1.randomizeUniform();
+  forward.cmpWithArg(&Matrix::bilinearForward,
+                     arg1,
+                     imgSizeH,
+                     imgSizeW,
+                     2 * imgSizeH,
+                     2 * imgSizeW,
+                     channels,
+                     ratioH,
+                     ratioW);
+
+  AutoCompare backward(numSamples, inWidth);
+  CpuMatrix arg2(numSamples, outWidth);
+  arg2.randomizeUniform();
+  backward.cmpWithArg(&Matrix::bilinearBackward,
+                      arg2,
+                      2 * imgSizeH,
+                      2 * imgSizeW,
+                      imgSizeH,
+                      imgSizeW,
+                      channels,
+                      ratioH,
+                      ratioW);
+}
+
+TEST(Matrix, BilinearFwdBwd) {
+  for (auto numSamples : {5, 10}) {
+    for (auto channels : {8, 16}) {
+      for (auto imgSizeH : {14, 28}) {
+        for (auto imgSizeW : {16, 30}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
+        }
+      }
+    }
+  }
+}
+
+void testMatrixAddBias(int height, int width, real scale) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(1, width);
+  arg1.randomizeUniform();
+  test.cmpWithArg(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::addBias),
+      arg1,
+      scale);
+}
+
+void testMatrixAddDotMulMMV(int height, int width) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(1, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  test.cmpWithArg(&BaseMatrix::addDotMulMMV, arg1, arg2);
+}
+
+TEST(Matrix, unary) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      testMatrixAddBias(height, width, 1.0);
+      testMatrixAddBias(height, width, 3.5);
+      testMatrixAddDotMulMMV(height, width);
+    }
+  }
+}
+
+void testMatrixAddAtOffset(int height, int width1, int width2, int offset) {
+  AutoCompare test(height, width2);
+  CpuMatrix arg1(height, width1);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::addAtOffset, arg1, offset);
+}
+
+void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) {
+  AutoCompare test(height, width2);
+  CpuMatrix arg1(height, width1);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::assignAtOffset, arg1, offset);
+}
+
+TEST(Matrix, AtOffset) {
+  for (auto height : {1, 11, 73, 128, 200}) {
+    for (auto width1 : {1, 32, 100, 512, 1000}) {
+      for (auto width2 : {1, 32, 100, 512, 1000}) {
+        int columnOffset = 0;
+        int offset = std::abs(width1 - width2);
+        if (offset) {
+          columnOffset = std::rand() % offset;
+        }
+        VLOG(3) << " height=" << height << " width1=" << width1
+                << " width2=" << width2 << " columnOffset = " << columnOffset;
+        testMatrixAddAtOffset(height, width1, width2, columnOffset);
+        testMatrixAssignAtOffset(height, width1, width2, columnOffset);
+      }
+    }
+  }
+}
+
+void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
+  AutoCompare test(numSamples, inputDim);
+  CpuMatrix arg1(tableSize, inputDim);
+  CpuIVector arg2(numSamples);
+  arg1.randomizeUniform();
+  arg2.rand(tableSize);
+  test.cmpWithArg(&Matrix::selectRows, arg1, arg2);
+}
+
+TEST(Matrix, tableProjection) {
+  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
+    for (auto tableSize : {10, 100}) {
+      for (auto inputDim : {20, 50}) {
+        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
+                << " inputDim=" << inputDim;
+        testMatrixSelectRows(numSamples, tableSize, inputDim);
+      }
+    }
+  }
+}
+
+void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
+  AutoCompare test(outHeight, width);
+  CpuMatrix arg1(inHeight, width);
+  CpuIVector arg2(outHeight);
+  arg1.randomizeUniform();
+  arg2.rand(inHeight);
+  test.cmpWithArg(&Matrix::copyByRowIndex, arg1, arg2);
+}
+
+TEST(Matrix, copyByRowIndex) {
+  for (auto outHeight : {31, 500, 1000}) {
+    for (auto inHeight : {17, 257, 500, 1200}) {
+      for (auto width : {512, 1024}) {
+        VLOG(3) << outHeight << " " << inHeight << " " << width;
+        testMatrixCopyByRowIndex(outHeight, inHeight, width);
+      }
+    }
+  }
+}
+
+void testCosSim(int heightX, int heightY, int width, real scale) {
+  AutoCompare test(heightX, 1);
+  CpuMatrix arg1(heightX, width);
+  CpuMatrix arg2(heightY, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
+}
+
+TEST(Matrix, cosSim) {
+  for (auto heightX : {10, 100, 1000}) {
+    for (auto heightY : {1, heightX}) {
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSim(heightX, heightY, width, scale);
+        }
+      }
+    }
+  }
+}
+
+void testParamReluForward(int height, int width, int w_height, int w_width) {
+  AutoCompare test(height, width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(w_height, w_width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg1.add(-0.5);
+  test.cmpWithArg(&Matrix::paramReluForward, arg1, arg2);
+}
+
+void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
+  AutoCompare test(w_height, w_width);
+  CpuMatrix arg1(height, width);
+  CpuMatrix arg2(height, width);
+  arg1.randomizeUniform();
+  arg2.randomizeUniform();
+  arg2.add(-0.5);
+  test.cmpWithArg(&Matrix::paramReluBackwardW, arg1, arg2);
+}
+
+TEST(Matrix, paramRelu) {
+  for (auto height : {10, 100}) {
+    for (auto width : {10, 100}) {
+      for (auto w_height : {1, 2}) {
+        for (auto w_width : {1, 2}) {
+          testParamReluForward(height, width, w_height, w_width);
+          testParamReluBackwardW(height, width, w_height, w_width);
+        }
+      }
+    }
+  }
+}
+
+void testAddSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(numSamples, dim);
+  CpuMatrix arg1(1, channel);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::addSharedBias, arg1, 1.0);
+}
+
+void testCollectSharedBias(int numSamples, int dim, int channel) {
+  AutoCompare test(1, channel);
+  CpuMatrix arg1(numSamples, dim);
+  arg1.randomizeUniform();
+  test.cmpWithArg(&Matrix::collectSharedBias, arg1, 1.0);
+}
+
+TEST(Matrix, sharedBias) {
+  for (auto numSamples : {1, 100, 520}) {
+    for (auto dim : {100 * 16, 100 * 32}) {
+      for (auto channel : {8, 16}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " channel=" << channel;
+        testAddSharedBias(numSamples, dim, channel);
+        testCollectSharedBias(numSamples, dim, channel);
+      }
+    }
+  }
+}
+
+void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
+  AutoCompare forward(numSamples, 1);
+  CpuMatrix arg1(numSamples, dim);
+  CpuSparseMatrix arg2(
+      numSamples, dim, numSamples, paddle::NO_VALUE, paddle::SPARSE_CSR);
+
+  CpuMatrix output1(numSamples, dim);
+  output1.randomizeUniform();
+  output1.softmax(arg1);
+  for (int i = 0; i < numSamples; i++) {
+    const unsigned int id = std::rand() % dim;
+    arg2.setRow(i, 1, &id, nullptr);
+  }
+  forward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
+
+  AutoCompare backward(numSamples, dim);
+  backward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
+}
+
+TEST(Matrix, multiBinaryCrossEntropy) {
+  for (auto numSamples : {100, 1000, 10000}) {
+    for (auto dim : {100, 1000, 10000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testMultiBinaryLabelCrossEntropy(numSamples, dim);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#endif
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index 8405b96fc2b915e2e1a5676ab5e3f25b4acde75a..f62843310d886ba7d449e793066b19a7cc7bd5a9 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,10 +17,10 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include <random>
-#include <functional>
 #include <algorithm>
+#include <functional>
 #include <memory>
+#include <random>
 
 #include <stdlib.h>
 #include <time.h>
diff --git a/paddle/math/tests/test_matrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
similarity index 99%
rename from paddle/math/tests/test_matrix.cpp
rename to paddle/math/tests/test_SparseMatrix.cpp
index 3788218aab100d4ad683e85149a9513e54ca2480..88b75b6d83612c56a598cf1b301bd38f888e1cce 100644
--- a/paddle/math/tests/test_matrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1859b9fc13576b6f1d0bc13b43f7e7a2ef6030c9
--- /dev/null
+++ b/paddle/math/tests/test_Tensor.cu
@@ -0,0 +1,1173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "TensorCheck.h"
+
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::CpuVector;
+using paddle::GpuVector;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+#define INIT_UNARY(A1, A2)                  \
+    Tensor A1(height, width);               \
+    Tensor A2(height, width);               \
+    A1.randomizeUniform();                  \
+    A2.copyFrom(A1)
+#define INIT_BINARY(A1, A2, B)              \
+    INIT_UNARY(A1, A2);                     \
+    Tensor B(height, width);                \
+    B.randomizeUniform()
+#define INIT_TERNARY(A1, A2, B, C)          \
+    INIT_BINARY(A1, A2, B);                 \
+    Tensor C(height, width);                \
+    C.randomizeUniform()
+#define INIT_QUATERNARY(A1, A2, B, C, D)    \
+    INIT_TERNARY(A1, A2, B, C);             \
+    Tensor D(height, width);                \
+    D.randomizeUniform()
+
+template<typename Tensor>
+struct TestUnaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_UNARY(A1, A2);
+        testUnaryFunc(A1, A2);
+      }
+    }
+  }
+};
+
+template<typename Tensor>
+struct TestBinaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
+
+  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_BINARY(A1, A2, B);
+        testBinaryFunc(A1, A2, B);
+      }
+    }
+  }
+};
+
+template<typename Tensor>
+struct TestTernaryMatrix {
+  typedef std::function<void(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)> TernaryFunc;
+
+  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_TERNARY(A1, A2, B, C);
+        testTernaryFunc(A1, A2, B, C);
+      }
+    }
+  }
+};
+
+template<typename Tensor>
+struct TestQuaternaryMatrix {
+  typedef std::function<void(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> QuaternaryFunc;
+
+  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_QUATERNARY(A1, A2, B, C, D);
+        testQuaternaryFunc(A1, A2, B, C, D);
+      }
+    }
+  }
+};
+
+template<typename Tensor, class T>
+struct TestUnaryVectorT {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
+    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
+      LOG(INFO) << " size=" << size;
+      Tensor A1(size);
+      Tensor A2(size);
+      if (typeid(T) == typeid(real)) {
+        A1.rand();
+      } else {
+        A1.rand(1000);
+      }
+      A2.copyFrom(A1);
+      testUnaryFunc(A1, A2);
+    }
+  }
+};
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+    }
+  }
+}
+
+template<typename Tensor>
+void testTensorAddScalar(Tensor& A1, Tensor& A2) {
+  real p1 = 2.5;
+  real p2 = 3.0;
+  A1.add(p1);   // a += p
+  A2 += p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(p1, p2);  // a = a * p1 + p2
+  A2 = A2 * p1 + p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSubScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.subScalar(p);  // a -= p
+  A2 -= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorMulScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.mulScalar(p);  // a *= p
+  A2 *= p;
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(learningRate, decayRate);
+  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorDivScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.divScalar(p);  // a /= p
+  A2 /= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorNeg(Tensor& A1, Tensor& A2) {
+  A1.neg();  // a = -a
+  A2 = -A2;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2) {
+  A1.abs2();  // a = a > 0 ? a : -a
+  A2 = A2.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2) {
+  A1.square2();  // a = a * a
+  A2 = A2.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2) {
+  A1.reciprocal2();  // a = 1.0f / a
+  A2 = A2.reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2) {
+  A1.sign2();  // a = (a > 0) - (a < 0)
+  A2 = A2.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2) {
+  A1.assign(1.5);   // a = p
+  A2 = A2.constant(1.5);
+  TensorCheckEqual(A1, A2);
+
+  A1.one();  // a = 1
+  A2 = A2.constant(1.0);
+  TensorCheckEqual(A1, A2);
+
+  A1.zero();  // a = 0
+  A2 = A2.constant(0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
+  testTensorAddScalar(A1, A2);
+  testTensorSubScalar(A1, A2);
+  testTensorMulScalar(A1, A2);
+  testTensorDivScalar(A1, A2);
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+  testTensorSquare(A1, A2);
+  testTensorReciprocal(A1, A2);
+  testTensorSign(A1, A2);
+  testTensorAssign(A1, A2);
+}
+
+template<typename Tensor>
+void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
+  A1.add(2);   // a += p
+  A2 += 2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(3, 2);  // a = a * p1 + p2
+  A2 = A2 * 3 + 2;
+  TensorCheckEqual(A1, A2);
+
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+}
+
+TEST(Unary, BaseOp) {
+  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
+  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
+  TestUnaryVectorT<CpuIVector, int>
+    testCpuIVector(testUnaryBaseOpInt<CpuIVector>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
+  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
+  TestUnaryVectorT<GpuIVector, int>
+    testGpuIVector(testUnaryBaseOpInt<GpuIVector>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2) {
+  A1.exp2();  // a = exp(a)
+  A2 = A2.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2) {
+  A1.log2();  // a = log(a)
+  A2 = A2.log();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2) {
+  A1.sqrt2();  // a = sqrt(a)
+  A2 = A2.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2) {
+  A1.pow2(3.2);  // a = pow(a, p)
+  A2 = A2.pow(3.2);
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testUnayrMathOp(Tensor& A1, Tensor& A2) {
+  testTensorExp(A1, A2);
+  testTensorLog(A1, A2);
+  testTensorSqrt(A1, A2);
+  testTensorPow(A1, A2);
+}
+
+TEST(Unary, MathOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorClip(Tensor& A1, Tensor& A2) {
+  real p1 = 0.003f;
+  real p2 = 0.877f;
+  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
+  // A2 = A2.min(0.877f).max(0.003f);
+  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
+  real p = 0.5f;
+  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
+  A2 = (A2 > p).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2) {
+  /**
+   * T lambda = p;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(learningRate, decayRate);
+  A2 = (A2 > (learningRate * decayRate)).condition(
+    (A2 - (learningRate * decayRate)),
+    (A2 < -(learningRate * decayRate)).condition(
+      (A2 + (learningRate * decayRate)), (real)0.0));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
+  testTensorClip(A1, A2);
+  testTensorBiggerThanScalar(A1, A2);
+
+  A1.randomizeUniform();
+  A1.subScalar(0.5f);
+  A2.copyFrom(A1);
+  testTensorapplyL1(A1, A2);
+}
+
+TEST(Unary, CompareOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.2;
+  A1.add(B);  // a += b
+  A2 += B;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1);  // a += b * p
+  A2 += B * p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
+  A2 = A2 * p1 + B * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.addScalar(B, p1);  // a = b + p
+  A2 = B + p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.addSquare(B, p1);  // a += p * b * b
+  A2 += B.constant(p1) * B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
+  A2 = A2 * p1 + B.constant(p2) * B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.sub(B);  // a -= b
+  A2 -= B;
+  TensorCheckEqual(A1, A2);
+
+  A1.sub(B, p);  // a -= b * p
+  A2 -= B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.subScalar(B, p);  // a = b - p
+  A2 = B - p;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.mulScalar(B, p);  // a = b * p
+  A2 = B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B);  // a *= b * b
+  A2 *= B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareMul(B);  // a = a * a * b
+  A2 = A2 * A2 * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMul(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.divScalar(B, p);  // a = b / p
+  A2 = B / p;
+  TensorCheckEqual(A1, A2);
+
+  A1.scalarDiv(B, p);  // a = p / b
+  A2 = B.constant(p) / B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.assign(B);  // a = b
+  A2 = B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.square2(A1);   // b = a * a
+  A2 = B.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.squareDerivative(B);  // a *= 2.0 * b
+  A2 = A2 * (real)2.0 * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.reciprocal2(A1);  // b = 1.0f / a
+  A2 = B.reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 0.58;
+  real p2 = 0.32;
+  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
+  A2 = (B * p1 + p2).reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
+  A2 *= (B.constant(1.0f) +
+    B.constant(learningRate * decayRate) * B).reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reciprocalDerivative(B);  // a *= -b * b
+  A2 *= (-B) * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
+  A2 = B.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.abs2(A1);  // b = a > 0.0f ? a : -a
+  A2 = B.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorAdd(A1, A2, B);
+  testTensorSub(A1, A2, B);
+  testTensorMul(A1, A2, B);
+  testTensorDiv(A1, A2, B);
+  testTensorSquare(A1, A2, B);
+  testTensorSquareDerivative(A1, A2, B);
+  testTensorReciprocal(A1, A2, B);
+  testTensorReciprocalDerivative(A1, A2, B);
+  testTensorAbs(A1, A2, B);
+  testTensorSign(A1, A2, B);
+  testTensorAssign(A1, A2, B);
+}
+
+TEST(Binary, BaseOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = exp(b)
+  A1.exp2(B);
+  A2 = B.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.expDerivative(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = log(b)
+  A1.log2(B);
+  A2 = B.log();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = sqrt(b)
+  A1.sqrt2(B);
+  A2 = B.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = 1.0f / sqrt(b)
+  A1.invSqrt(B);
+  A2 = B.sqrt().reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.pow2(B, 2.5f);  // a = pow(b, p)
+  A2 = B.pow(2.5f);
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * b = log(1.0 +
+   *         exp((a > THRESHOLD) ? THRESHOLD
+   *             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
+   */
+  B.softrelu(A1);
+
+  real THRESHOLD = 40.0;
+  A2 = (B.constant(1.0f) +
+        (B > THRESHOLD).condition(
+          THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
+   *                             ? THRESHOLD
+   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
+   */
+  A1.softreluDerivative(B);
+  real THRESHOLD = 40.0;
+  A2 = A2 * (B.constant(1.0f) -
+             (B.constant(-1.0f) *
+              (B > THRESHOLD).condition(
+                THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp());
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+    const T THRESHOLD_MIN = -40.0;
+    const T THRESHOLD_MAX = 13.0;
+    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
+            : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
+    b = 1.0f / (1.0f + exp(-tmp)))
+   */
+  B.sigmoid(A1);
+
+  const real THRESHOLD_MIN = -40.0;
+  const real THRESHOLD_MAX = 13.0;
+  auto tmp = (B < THRESHOLD_MIN).condition(
+    THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
+  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
+  A2 *= B * (B.constant(1.0f) - B);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.tanhDerivative(B);  // a *= 1 - b * b
+  A2 *= B.constant(1.0f) - B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
+  B.scaledTanh(A1, p1, p2);
+  A2 = B.constant(p1) *
+      (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0)
+       - (real)1.0);
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // a *= (p2 / p1) * (p1 * p1 - b * b));
+  A1.scaledTanhDerivative(B, p1, p2);
+  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorTanhDerivative(A1, A2, B);
+  testTensorScaledTanhDerivative(A1, A2, B);
+  testTensorSigmoidDerivative(A1, A2, B);
+  testTensorExpDerivative(A1, A2, B);
+  testTensorScaledTanh(A1, A2, B);
+  testTensorTanh(A1, A2, B);
+  testTensorExp(A1, A2, B);
+  testTensorLog(A1, A2, B);
+  testTensorSqrt(A1, A2, B);
+  testTensorInvSqrt(A1, A2, B);
+  testTensorPow(A1, A2, B);
+
+  testTensorSoftrelu(A1, A2, B);
+  testTensorSoftreluDerivative(A1, A2, B);
+  testTensorSigmoid(A1, A2, B);
+}
+
+TEST(Binary, MathOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
+  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
+  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * b = a > p1 ? a : p1
+   * b = b < p2 ? b : p2
+   * int p1 = 0, p2 = 24;
+   */
+  SetTensorValue(B, 32.0f);
+  B.brelu(A1);
+  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
+  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  SetTensorValue(B, 32.0f);
+  /*
+   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
+   * int p1 = 0, p2 = 24;
+   */
+  A1.breluDerivative(B);
+  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
+  A2 = (B > (real)0.0f).condition(A2,
+    (B < (real)0.0f).condition(-A2, (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 0.613;
+  SetTensorValue(B, p);
+  A1.isEqualTo(B, p);  // a = (b == p)
+  A2 = (B == p);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
+  /**
+   * T lambda = p * b;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(B, learningRate, decayRate);
+  auto lambda = B.constant(learningRate * decayRate) * B;
+  A2 = (A2 > lambda).condition(
+    (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.subScalar(0.5f);
+  SetTensorValue(B, 0.0f);
+  testTensorReluDerivative(A1, A2, B);
+
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  testTensorBreluDerivative(A1, A2, B);
+
+  testTensorAbsDerivative(A1, A2, B);
+  testTensorRelu(A1, A2, B);
+  testTensorBrelu(A1, A2, B);
+  testTensorIsEqualTo(A1, A2, B);
+}
+
+TEST(Binary, CompareOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.add(B, C);  // a = b + c
+  A2 = B + C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.8;
+  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
+  A2 = B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C);  // a = a + b + c
+  A2 = A2 + B + C;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
+  A2 = A2 * p1 + B * p2 + C * p3;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
+  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.sub(B, C);  // a = b - c
+  A2 = B - C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
+  A2 = B * p1 - C * p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotMul(B, C);  // a = b * c
+  A2 = B * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B, C);  // a = b * c * c
+  A2 = B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareSquare(B, C);  // a = b * b * c * c
+  A2 = B * B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a *= tmp * tmp
+   */
+  A1.dotMulSquareSum(B, C, p1, p2);
+  auto tmp = B * p1 + C * p2;
+  A2 *= tmp * tmp;
+  TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a = tmp * tmp
+   */
+  A1.dotSquareSum(B, C, p1, p2);
+  auto tmp2 = B * p1 + C * p2;
+  A2 = tmp2 * tmp2;
+  TensorCheckEqual(A1, A2);
+
+  // a *= p1 * b + p2 * c
+  A1.dotMulSum(B, C, p1, p2);
+  A2 *= B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  // a = p1 * a + p2 * b * c
+  A1.addDotMul(B, C, p1, p2);
+  A2 = A2 * p1 + B.constant(p2) * B * C;
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
+  A2 = (B == (real)0.0).condition((real)0.0, B / C);
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
+  A2 = (B + p1) / (C + p2);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.5;
+  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
+  A2 = (B * p1 + C * p2 + p3).reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
+  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorSoftCrossEntropyBp(Tensor& A1,
+                                  Tensor& A2,
+                                  Tensor& B,
+                                  Tensor& C) {
+  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
+  A2 += (B - C) / (B * (B.constant(1.0f) - B));
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorAdd(A1, A2, B, C);
+  testTensorSub(A1, A2, B, C);
+  testTensorMul(A1, A2, B, C);
+  testTensorDiv(A1, A2, B, C);
+  testTensorReciprocal(A1, A2, B, C);
+  testTensorSoftCrossEntropyBp(A1, A2, B, C);
+
+  testTensorSoftCrossEntropy(A1, A2, B, C);
+}
+
+TEST(Ternary, BaseOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorBinaryLabelCrossEntropy(Tensor& A1,
+                                       Tensor& A2,
+                                       Tensor& B,
+                                       Tensor& C) {
+  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
+  A2 = (C > (real)0.5).condition(
+    -(B.log()), -((B.constant(1.0f) - B).log()));
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
+                                         Tensor& A2,
+                                         Tensor& B,
+                                         Tensor& C) {
+  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
+  A1.binaryLabelCrossEntropyBp(B, C);
+  A2 += (C > (real)0.5).condition(
+    (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal());
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorLogisticRegressionLoss(Tensor& A1,
+                                      Tensor& A2,
+                                      Tensor& B,
+                                      Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                        ? -THRESHOLD
+   *                                        : b;
+   * a = log(1 + exp(x)) - c * x
+   */
+  A1.logisticRegressionLoss(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp = (B > THRESHOLD).condition(
+    THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorLogisticRegressionLossBp(Tensor& A1,
+                                        Tensor& A2,
+                                        Tensor& B,
+                                        Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                        ? -THRESHOLD
+   *                                        : b;
+   * x = exp(x); a = x / (1 + x) - c
+   */
+  A1.logisticRegressionLossBp(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp = (B > THRESHOLD).condition(
+    THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  auto tmp2 = tmp.exp();
+  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
+  A2 = (B > C).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.max2(B, C);  // a = (b > c) ? b : c
+  A2 = (B > C).condition(B, C);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
+  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
+  testTensorBiggerThan(A1, A2, B, C);
+  testTensorMax(A1, A2, B, C);
+
+  testTensorLogisticRegressionLoss(A1, A2, B, C);
+  testTensorLogisticRegressionLossBp(A1, A2, B, C);
+}
+
+TEST(Ternary, CompareOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testQuaternaryAdd(Tensor& A1,
+                       Tensor& A2,
+                       Tensor& B,
+                       Tensor& C,
+                       Tensor& D) {
+  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
+  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
+  // TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c + p3 * d;
+   * a += tmp * tmp
+   */
+  real p1 = 1.5f;
+  real p2 = 2.5f;
+  real p3 = 3.5f;
+  A1.addSquareSum(B, C, D, p1, p2, p3);
+  auto tmp = B * p1 + C * p2 + D * p3;
+  A2 += tmp * tmp;
+  TensorCheckEqual(A1, A2);
+}
+
+TEST(Quaternary, BaseOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
+#endif
+}
+
+template<typename Tensor>
+void testTensorBiggerThan(Tensor& A1,
+                          Tensor& A2,
+                          Tensor& B,
+                          Tensor& C,
+                          Tensor& D) {
+  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
+  A1.biggerThan(B, C, D);
+  A2 = ((B > C && D > (real)0.5)
+        || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorRankLoss(Tensor& A1,
+                        Tensor& A2,
+                        Tensor& B,
+                        Tensor& C,
+                        Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = log(1 + exp(a)) - a * d
+   */
+  A1.rankLoss(B, C, D);
+
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 = (tmp > THRESHOLD).condition(
+    THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testTensorRankLossBp(Tensor& A1,
+                          Tensor& A2,
+                          Tensor& B,
+                          Tensor& C,
+                          Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = exp(a); a = (a / (1 + a) - d)
+   */
+  A1.rankLossBp(B, C, D);
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 = (tmp > THRESHOLD).condition(
+    THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  auto tmp3 = tmp2.exp();
+  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template<typename Tensor>
+void testQuaternaryCompareOp(Tensor& A1,
+                             Tensor& A2,
+                             Tensor& B,
+                             Tensor& C,
+                             Tensor& D) {
+  testTensorBiggerThan(A1, A2, B, C, D);
+  testTensorRankLoss(A1, A2, B, C, D);
+  testTensorRankLossBp(A1, A2, B, C, D);
+}
+
+TEST(Quaternary, CompareOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
+
+#ifndef PADDLE_ONLY_CPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  hl_start();
+  hl_init(0);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c458cba9ca11e9af8a98b88a6392978c2a9be77
--- /dev/null
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -0,0 +1,469 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "OriginalOptimizerApi.h"
+#include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+
+#ifndef PADDLE_TYPE_DOUBLE
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
+#else
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
+#endif
+
+class SetMaxDiff {
+public:
+  explicit SetMaxDiff(double max_diff) {
+    max_diff_ = FLAGS_max_diff;
+    FLAGS_max_diff = max_diff;
+  }
+  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
+
+private:
+  double max_diff_;
+};
+
+#define COPY_VECTOR_TO_CPU(cpuVec, vector)               \
+  do {                                                   \
+    if (vector->useGpu()) {                              \
+      cpuVec = Vector::create(vector->getSize(), false); \
+      cpuVec->copyFrom(*vector);                         \
+    } else {                                             \
+      cpuVec = vector;                                   \
+    }                                                    \
+  } while (0)
+
+int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const real* data1 = vector1.getData();
+  const real* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (fabs(a - b) > FLAGS_max_diff) {
+      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
+  VectorPtr tmp1;
+  VectorPtr tmp2;
+  COPY_VECTOR_TO_CPU(tmp1, vector1);
+  COPY_VECTOR_TO_CPU(tmp2, vector2);
+  return VectorCheckErr(*tmp1, *tmp2);
+}
+
+#ifdef PADDLE_DISABLE_TIMER
+
+#define CHECK_VECTORPTR(vector1, vector2) \
+  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
+
+#else
+
+#define CHECK_VECTORPTR(vector1, vector2)
+
+#endif
+
+typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
+
+void testCase(testMatrixFunc matrixFunc) {
+#ifndef PADDLE_ONLY_CPU
+  for (auto useGpu : {false, true}) {
+#else
+  for (auto useGpu : {false}) {
+#endif
+    for (auto size : {1,
+                      32,
+                      64,
+                      128,
+                      512,
+                      1024,
+                      4096,
+                      32768,
+                      65536,
+                      131072,
+                      262144,
+                      524288,
+                      1048576,
+                      2097152}) {
+      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
+      matrixFunc(size, useGpu);
+    }
+  }
+}
+
+#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
+  vec1[type] = Vector::create(size, useGpu);        \
+  vec2[type] = Vector::create(size, useGpu);        \
+  vec1[type]->rand();                               \
+  vec2[type]->copyFrom(*vec1[type]);
+
+void testAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
+      bufs1, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adagradApply(value,
+                                      grad,
+                                      mom,
+                                      accum_buffer,
+                                      accum,
+                                      lr,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, Adagrad) { testCase(testAdagrad); }
+
+void testAdaDelta(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
+      bufs1, rou, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adadeltaApply(value,
+                                       grad,
+                                       mom,
+                                       accum,
+                                       accum_update,
+                                       lr,
+                                       rou,
+                                       epsilon,
+                                       learningRate,
+                                       momentum,
+                                       decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, AdaDelta) { testCase(testAdaDelta); }
+
+template <bool isFirstTime>
+void testRMSProp(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  /* make sure 'g - f.square()' greater than 0 */
+  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
+  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
+      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
+                                                   accumulatedRou,
+                                                   rou,
+                                                   epsilon,
+                                                   learningRate,
+                                                   momentum,
+                                                   decayRate,
+                                                   isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(rmspropApply(value,
+                                      grad,
+                                      mom,
+                                      sum,
+                                      sum1,
+                                      lr,
+                                      accumulatedRou,
+                                      rou,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate,
+                                      isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, RMSProp) {
+  testCase(testRMSProp<true>);
+  testCase(testRMSProp<false>);
+}
+
+template <bool isFirstTime>
+void testDecayedAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  if (isFirstTime) {
+    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+  }
+
+  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
+                                                          accumulatedRou,
+                                                          rou,
+                                                          epsilon,
+                                                          learningRate,
+                                                          momentum,
+                                                          decayRate,
+                                                          isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
+                                             grad,
+                                             mom,
+                                             sum,
+                                             lr,
+                                             accumulatedRou,
+                                             rou,
+                                             epsilon,
+                                             learningRate,
+                                             momentum,
+                                             decayRate,
+                                             isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, DecayedAdagrad) {
+  testCase(testDecayedAdagrad<false>);
+  testCase(testDecayedAdagrad<true>);
+}
+
+void testAdam(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
+      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
+
+  EXPRESSION_PERFORMANCE(adamApply(value,
+                                   grad,
+                                   mom,
+                                   v,
+                                   beta1,
+                                   beta2,
+                                   beta1_power,
+                                   beta2_power,
+                                   epsilon,
+                                   learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
+                  bufs2[PARAMETER_SECOND_MOMENTUM]);
+}
+
+TEST(Training, Adam) { testCase(testAdam); }
+
+void testAdamax(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
+  int64_t step = 2;
+
+  EXPRESSION_PERFORMANCE(
+      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
+
+  EXPRESSION_PERFORMANCE(
+      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
+                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
+}
+
+TEST(Training, Adamax) {
+#ifndef PADDLE_TYPE_DOUBLE
+  SetMaxDiff diff(1e-4);
+#endif
+  testCase(testAdamax);
+}
+
+void testSparseMomentum(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
+
+  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
+  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+
+  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
+      bufs1, alpha, beta, gamma, tau, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
+  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
+
+  EXPRESSION_PERFORMANCE(sparseMomentumApply(
+      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
+}
+
+TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index a9596992b2b1fced417c048600b05b39882b2bf2..9925e24dc14294ec70806ffd9cc496ea01beaa43 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "test_matrixUtil.h"
 #include "hl_batch_transpose.h"
+#include "test_matrixUtil.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
new file mode 100644
index 0000000000000000000000000000000000000000..16541edb54b807d4e1690d4ae63fd44459e2d726
--- /dev/null
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/math/TensorAssign.h"
+#include "TensorCheck.h"
+#include "PerfUtils.h"
+
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+typedef std::function<void(int height, int width)> testMatrixFunc;
+void testMatrixCase(testMatrixFunc matrixFunc) {
+  for (auto height : {1}) {
+    for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
+                       262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
+      matrixFunc(height, width);
+    }
+  }
+}
+
+template<typename Tensor>
+void testLazyAssign(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor B(height, width);
+  Tensor C(height, width);
+  Tensor D(height, width);
+  A1.randomizeUniform();
+  B.randomizeUniform();
+  C.randomizeUniform();
+  D.randomizeUniform();
+  A2.copyFrom(A1);
+
+  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
+
+  EXPRESSION_PERFORMANCE(
+    auto expr1 = A2.lazyAssign(B + C);
+    auto expr2 = A2.lazyAssign(A2 * D);
+    AssignEvaluate(expr1, expr2););
+
+  TensorCheckErr(A1, A2);
+}
+
+TEST(lazyAssign, CPU) {
+  testMatrixCase(testLazyAssign<CpuMatrix>);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(lazyAssign, GPU) {
+  testMatrixCase(testLazyAssign<GpuMatrix>);
+}
+#endif
+
+template<typename Tensor>
+void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
+     real p1, real p2, real p3) {
+  C = C * p2 - D * (B + A * p3) * p1;
+  A += C;
+}
+
+void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
+    BaseMatrix& C, BaseMatrix& D,
+    real p1, real p2, real p3) {
+  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
+  auto expr2 = A.lazyAssign(A + C);
+  AssignEvaluate(expr1, expr2);
+}
+
+template<typename Tensor>
+void testSgdUpdate(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor A3(height, width);
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  A3.copyFrom(A1);
+
+  Tensor B(height, width);
+  B.randomizeUniform();
+
+  Tensor C1(height, width);
+  Tensor C2(height, width);
+  Tensor C3(height, width);
+  C1.randomizeUniform();
+  C2.copyFrom(C1);
+  C3.copyFrom(C1);
+
+  Tensor D(height, width);
+  D.randomizeUniform();
+
+  real p1 = 0.2;
+  real p2 = 0.3;
+  real p3 = 0.5;
+
+  /**
+   * c = p2 * c - p1 * (b + p3 * a);
+   * a = a + c;
+   */
+  // BaseMatrix API
+  EXPRESSION_PERFORMANCE(
+  A1.sgdUpdate(B, C1, D, p1, p2, p3););
+
+  // Tensor expression
+  EXPRESSION_PERFORMANCE(
+    sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
+
+  // lazyAssign
+  EXPRESSION_PERFORMANCE(
+    sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
+
+  TensorCheckErr(A1, A2);
+  TensorCheckErr(A1, A3);
+  TensorCheckErr(C1, C2);
+  TensorCheckErr(C1, C3);
+}
+
+TEST(sgdUpdate, CPU) {
+  testMatrixCase(testSgdUpdate<CpuMatrix>);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(sgdUpdate, GPU) {
+  testMatrixCase(testSgdUpdate<GpuMatrix>);
+}
+#endif
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  hl_start();
+  hl_init(0);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index ae5bc5a86a1790ce30a8d7f83c9564f52d7cf7ea..62de5b25e4cc803d9ccc605fba29a1d29a3ea69c 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,169 +16,18 @@ limitations under the License. */
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
-#include "paddle/utils/Util.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include <gtest/gtest.h>
+#include "TensorCheck.h"
 #include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-
-template <class T>
-void VectorCheckEqual(const VectorT<T>& vector1, const VectorT<T>& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const T* data1 = vector1.getData();
-  const T* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    if (data1[i] != data2[i]) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void MatrixCheckEqual(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (data1[i * width + j] != data2[i * width + j]) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  target->bilinearForward(*input,
-                          imgSizeH,
-                          imgSizeW,
-                          2 * imgSizeH,
-                          2 * imgSizeW,
-                          channels,
-                          ratioH,
-                          ratioW);
-  targetGpu->bilinearForward(*inputGpu,
-                             imgSizeH,
-                             imgSizeW,
-                             2 * imgSizeH,
-                             2 * imgSizeW,
-                             channels,
-                             ratioH,
-                             ratioW);
-
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->bilinearBackward(*targetGrad,
-                              2 * imgSizeH,
-                              2 * imgSizeW,
-                              imgSizeH,
-                              imgSizeW,
-                              channels,
-                              ratioH,
-                              ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad,
-                                 2 * imgSizeH,
-                                 2 * imgSizeW,
-                                 imgSizeH,
-                                 imgSizeW,
-                                 channels,
-                                 ratioH,
-                                 ratioW);
-
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
-}
-
-TEST(Matrix, BilinearFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
-        }
-      }
-    }
-  }
-}
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
 
 void testMatrixProjectionForward(int contextStart,
                                  int contextLength,
@@ -232,12 +81,7 @@ void testMatrixProjectionForward(int contextStart,
                                       beginPad,
                                       padding);
 
-  // check
-  MatrixPtr outputCheck =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  outputCheck->copyFrom(*gpuOutput);
-
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
 }
 
 void testMatrixProjectionBackward(int contextStart,
@@ -294,15 +138,9 @@ void testMatrixProjectionBackward(int contextStart,
                                                    beginPad);
   }
 
-  // check
-  MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  inputGradCheck->copyFrom(*gpuInputGrad);
-  MatrixCheckErr(*cpuInputGrad, *inputGradCheck);
-
+  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
   if (padding) {
-    MatrixPtr weightGradChcek = std::make_shared<CpuMatrix>(pad, inputDim);
-    weightGradChcek->copyFrom(*gpuWeightGrad);
-    MatrixCheckErr(*cpuWeightGrad, *weightGradChcek);
+    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
   }
 }
 
@@ -361,15 +199,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
   cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
   gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-
-  IVectorPtr indexCheck = nullptr;
-  IVector::resizeOrCreate(indexCheck, newBatchSize * inputDim, false);
-  indexCheck->copyFrom(*gpuIndex);
-  VectorCheckEqual(*cpuIndex, *indexCheck);
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
+  TensorCheckEqual(*cpuIndex, *gpuIndex);
 
   // backward
   MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
@@ -385,10 +216,7 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
   cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
   gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
 
-  // check
-  MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  inputGradCheck->copyFrom(*gpuInputGrad);
-  MatrixCheckEqual(*cpuInputGrad, *inputGradCheck);
+  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
 }
 
 TEST(Matrix, maxSequence) {
@@ -431,6 +259,8 @@ void testMatrixZeroAtOffset(int height, int width) {
   int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
   int numColumns = rand() % (width - columnOffset);  // NOLINT
 
+  if (numColumns == 0) return;
+
   cpuA->zeroAtOffset(columnOffset, numColumns);
   gpuA->zeroAtOffset(columnOffset, numColumns);
 
@@ -442,304 +272,26 @@ void testMatrixZeroAtOffset(int height, int width) {
     }
   }
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-  MatrixCheckEqual(*cpuA, *cpuTest);
-}
-
-void testMatrixBinaryAdd(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  cpuA->add(*cpuB);
-  gpuA->add(*gpuB);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-}
-
-void testMatrixAssign(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuA->assign(2.5);
-  gpuA->assign(2.5);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-}
-
-void testMatrixAdd(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuA->add(2.5);
-  gpuA->add(2.5);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-}
-
-void testMatrixSqrt(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuA->sqrt();
-  gpuA->sqrt();
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixTanhDerivative(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  cpuA->tanhDerivative(*cpuB);
-  gpuA->tanhDerivative(*gpuB);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixTanh(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  cpuA->tanh(*cpuB);
-  gpuA->tanh(*gpuB);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixTernarySub(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  cpuA->sub(*cpuB, *cpuC);
-  gpuA->sub(*gpuB, *gpuC);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-}
-
-void testMatrixSumOfSquaresBp(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  cpuA->sumOfSquaresBp(*cpuB, *cpuC);
-  gpuA->sumOfSquaresBp(*gpuB, *gpuC);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixBinaryRowScale(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, 1);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr gpuA1 = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB1 = std::make_shared<GpuMatrix>(height, 1);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  gpuA1->copyFrom(*cpuA);
-  gpuB1->copyFrom(*cpuB);
-
-  cpuA->addColVector(*cpuB);
-  gpuA->addColVector(*gpuB);
-  cpuA1->addColumnVector(*cpuB1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-
-  MatrixCheckEqual(*cpuA, *cpuA1);
-}
-
-void testMatrixAddBias(int height, int width, real scale) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  cpuA->addBias(*cpuB, scale);
-  gpuA->addBias(*gpuB, scale);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
+  TensorCheckEqual(*cpuA, *gpuA);
+  TensorCheckEqual(*cpuA, *cpuTest);
 }
 
-void testMatrixTernaryRowScale(int height, int width) {
+void testMatrixDeepSwap(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  cpuC1->copyFrom(*cpuC);
-
-  int columnOffset = rand() % width;  // NOLINT
-
-  cpuA->rowScale(columnOffset, *cpuB, *cpuC);
-  gpuA->rowScale(columnOffset, *gpuB, *gpuC);
-  cpuA1->rowScale2(columnOffset, *cpuB1, *cpuC1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckEqual(*cpuA, *outputCheck);
-
-  MatrixCheckEqual(*cpuA, *cpuA1);
-}
-
-void testMatrixTernaryRowDotMul(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
-
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  cpuC1->copyFrom(*cpuC);
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  int columnOffset = rand() % width;  // NOLINT
-
-  cpuA->rowDotMul(columnOffset, *cpuB, *cpuC);
-  gpuA->rowDotMul(columnOffset, *gpuB, *gpuC);
-  cpuA1->rowDotMul2(columnOffset, *cpuB1, *cpuC1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *cpuA1);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixAddDotMulMMV(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(1, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(1, width);
+  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
 
   cpuA->randomizeUniform();
   cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  cpuC1->copyFrom(*cpuC);
+  cpuCopyA->copyFrom(*cpuA);
+  cpuCopyB->copyFrom(*cpuB);
 
-  cpuA->addDotMulMMV(*cpuB, *cpuC);
-  gpuA->addDotMulMMV(*gpuB, *gpuC);
-  cpuA1->addDotMulMMV2(*cpuB1, *cpuC1);
+  // swap matrix cpuA and cpuB
+  cpuA->deepSwap(*cpuB);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-  MatrixCheckEqual(*cpuA, *cpuA1);
+  TensorCheckEqual(*cpuA, *cpuCopyB);
+  TensorCheckEqual(*cpuB, *cpuCopyA);
 }
 
 void testMatrixTranspose(int height, int width) {
@@ -753,9 +305,7 @@ void testMatrixTranspose(int height, int width) {
   cpu->transpose(cpuT, false);
   gpu->transpose(gpuT, false);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(width, height);
-  outputCheck->copyFrom(*gpuT);
-  MatrixCheckEqual(*cpuT, *outputCheck);
+  TensorCheckEqual(*cpuT, *gpuT);
 }
 
 void testMatrixInverse(int height) {
@@ -776,529 +326,127 @@ void testMatrixInverse(int height) {
   cpu->inverse(cpuI, false);
   gpu->inverse(gpuI, false);
 
-  outputCheck->copyFrom(*gpuI);
-  MatrixCheckErr(*cpuI, *outputCheck);
-
-  outputCheck->mul(cpu, cpuI);
-  cpu->setDiag(1.0);
-  MatrixCheckErr(*cpu, *outputCheck);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      // applyUnary
-      testMatrixAssign(height, width);
-      testMatrixAdd(height, width);
-      testMatrixSqrt(height, width);
-
-      // applyBinary
-      testMatrixBinaryAdd(height, width);
-      testMatrixTanh(height, width);
-      testMatrixTanhDerivative(height, width);
-
-      // applyTernary
-      testMatrixTernarySub(height, width);
-      testMatrixSumOfSquaresBp(height, width);
-
-      // asRowVector
-      testMatrixAddBias(height, width, 1.0);
-      testMatrixAddBias(height, width, 3.5);
-      testMatrixAddDotMulMMV(height, width);
-
-      // asColVector
-      testMatrixTernaryRowScale(height, width);
-      testMatrixBinaryRowScale(height, width);
-
-      // sum
-      testMatrixGetSum(height, width);
-
-      // transpose
-      testMatrixTranspose(height, width);
-    }
-    // inverse
-    testMatrixInverse(height);
-  }
-}
-
-void testMatrixSoftmax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckErr(*cpuOutput, *outputCheck);
-}
-
-void testSequenceSoftmax(int batchSize) {
-  // forward
-  int inputDim = 1;
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
-  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
-
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  outputCheck->copyFrom(*gpuInput);
-  MatrixCheckErr(*cpuInput, *outputCheck);
-}
-
-void testMatrixSoftmaxThreshold(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  cpuInput->getData()[0] = 100.0;
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  // check output zero
-  int cpuCount = 0;
-  int gpuCount = 0;
-  auto zeroNum = [](MatrixPtr out, int& count) {
-    for (size_t i = 0; i < out->getHeight(); i++) {
-      for (size_t j = 0; j < out->getWidth(); j++) {
-        if (out->getElement(i, j) == 0) count++;
-      }
-    }
-  };
-  zeroNum(cpuOutput, cpuCount);
-  zeroNum(outputCheck, gpuCount);
-  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
-  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
-}
-
-void testMatrixSoftmaxBp(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuOutput->softmaxBackward(*gpuInput);
-
-  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
-  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
-  sftMaxSum->colMerge(*sftMaxDot);
-  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckErr(*cpuOutput, *outputCheck);
-}
-
-TEST(Matrix, softmax) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width : {1, 32, 100, 512, 1000}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixSoftmax(height, width);
-      testMatrixSoftmaxBp(height, width);
-      testMatrixSoftmaxThreshold(height, width);
-    }
-    testSequenceSoftmax(height);
-  }
-}
-
-void testMatrixAddDotMulVMM(int height, int width, int endCol = 0) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  cpuC1->copyFrom(*cpuC);
-
-  if (!endCol) {
-    cpuA->addDotMulVMM(*cpuB, *cpuC);
-    gpuA->addDotMulVMM(*gpuB, *gpuC);
-    cpuA1->addDotMulVMM2(*cpuB1, *cpuC1);
-
-    MatrixCheckErr(*cpuA, *cpuA1);
-  } else {
-    MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
-    MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
-    MatrixPtr subCpuC = cpuC->subColMatrix(0, endCol);
-    MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
-    MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
-    MatrixPtr subGpuC = gpuC->subColMatrix(0, endCol);
-    subCpuA->addDotMulVMM(*subCpuB, *subCpuC);
-    subGpuA->addDotMulVMM(*subGpuB, *subGpuC);
-  }
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixRowSum(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA1 = std::make_shared<GpuMatrix>(height, 1);
-  MatrixPtr gpuB1 = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  cpuA1->copyFrom(*cpuA);
-  cpuB1->copyFrom(*cpuB);
-  gpuA1->copyFrom(*cpuA);
-  gpuB1->copyFrom(*cpuB);
-
-  cpuA->colMerge(*cpuB);
-  gpuA->colMerge(*gpuB);
-
-  cpuB1->rowSum(*cpuA1);
-  gpuB1->rowSum(*gpuA1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-  outputCheck->copyFrom(*gpuA1);
-  MatrixCheckErr(*cpuA1, *outputCheck);
-}
-
-void testMatrixRowMax(int height, int width, int endCol = 0) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  if (!endCol) {
-    cpuB->rowMax(*cpuA);
-    gpuB->rowMax(*gpuA);
-  } else {
-    MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
-    MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
-    subCpuB->rowMax(*cpuA);
-    subGpuB->rowMax(*gpuA);
-  }
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixColSum(int height, int width, int endCol = 0) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  if (!endCol) {
-    cpuA->accumulateColSum(*cpuB);
-    gpuA->accumulateColSum(*gpuB);
-  } else {
-    MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
-    MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
-    MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
-    MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
-    subCpuA->accumulateColSum(*subCpuB);
-    subGpuA->accumulateColSum(*subGpuB);
-  }
+  TensorCheckErr(*cpuI, *gpuI);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixColMax(int height, int width, int endCol = 0) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  if (!endCol) {
-    cpuB->colMax(*cpuA);
-    gpuB->colMax(*gpuA);
-  } else {
-    MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
-    MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
-    MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
-    MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
-    subCpuB->colMax(*subCpuA);
-    subGpuB->colMax(*subGpuA);
-  }
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixCollectBias(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-
-  real scale = 1.0f / (rand() % 10);  // NOLINT
-
-  cpuA->collectBias(*cpuB, scale);
-  gpuA->collectBias(*gpuB, scale);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixSumOfSquares(int height, int width, int endCol = 0) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  if (!endCol) {
-    cpuA->sumOfSquares(*cpuB, *cpuC);
-    gpuA->sumOfSquares(*gpuB, *gpuC);
-  } else {
-    MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
-    MatrixPtr subCpuC = cpuC->subColMatrix(0, endCol);
-    MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
-    MatrixPtr subGpuC = gpuC->subColMatrix(0, endCol);
-    cpuA->sumOfSquares(*subCpuB, *subCpuC);
-    gpuA->sumOfSquares(*subGpuB, *subGpuC);
-  }
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-}
-
-void testMatrixBinaryClassificationError(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
-
-  MatrixPtr cpuA2 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB2 = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuC2 = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-  cpuA2->copyFrom(*cpuA);
-  cpuB2->copyFrom(*cpuB);
-  cpuC2->copyFrom(*cpuC);
-
-  real scale = 0.5;
-  int columnOffset = rand() % width;  // NOLINT
-
-  cpuA->binaryClassificationError(columnOffset, *cpuB, *cpuC, scale);
-  gpuA->binaryClassificationError(columnOffset, *gpuB, *gpuC, scale);
-  cpuA2->binaryClassificationError2(columnOffset, *cpuB2, *cpuC2, scale);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuA);
-  MatrixCheckErr(*cpuA, *outputCheck);
-  MatrixCheckErr(*cpuA, *cpuA2);
-}
-
-TEST(Matrix, aggregate) {
-  for (auto height : {1, 11, 16, 32, 64, 73, 128, 200, 1024, 2345}) {
-    for (auto width : {1, 9, 16, 32, 64, 100, 512, 1000, 1024, 2453}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testMatrixRowSum(height, width);
-      testMatrixRowMax(height, width);
-      testMatrixColSum(height, width);
-      testMatrixColMax(height, width);
-      testMatrixCollectBias(height, width);
-      testMatrixTernaryRowDotMul(height, width);
-      testMatrixAddDotMulVMM(height, width);
-
-      testMatrixSumOfSquares(height, width);
-      testMatrixBinaryClassificationError(height, width);
-    }
-  }
+  outputCheck->mul(cpu, cpuI);
+  cpu->setDiag(1.0);
+  TensorCheckErr(*cpu, *outputCheck);
 }
 
-TEST(Matrix, aggregate2) {
-  for (auto height : {16, 32, 128, 512, 1024}) {
-    for (auto width :
-         {16, 32, 64, 128, 256, 512, 768, 1024, 2048, 3072, 4096}) {
+TEST(Matrix, unary) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
       VLOG(3) << " height=" << height << " width=" << width;
 
-      int endCol = rand() % width;  // NOLINT
-      testMatrixRowMax(height, width, endCol);
-      testMatrixSumOfSquares(height, width, endCol);
-      testMatrixColSum(height, width, endCol);
-      testMatrixColMax(height, width, endCol);
-      testMatrixAddDotMulVMM(height, width, endCol);
+      testMatrixDeepSwap(height, width);
+      testMatrixZeroAtOffset(height, width);
+      testMatrixGetSum(height, width);
+      testMatrixTranspose(height, width);
     }
+    // inverse
+    testMatrixInverse(height);
   }
 }
 
-void testMatrixAddAtOffset(int height, int width1, int width2) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
+void testMatrixSoftmax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
 
   cpuInput->randomizeUniform();
   gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  int columnOffset = 0;
-  int offset = std::abs(width1 - width2);
-  if (offset) {
-    columnOffset = rand() % offset;  // NOLINT
-  }
-  cpuOutput->addAtOffset(*cpuInput, columnOffset);
-  gpuOutput->addAtOffset(*gpuInput, columnOffset);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
 }
 
-void testMatrixAssignAtOffset(int height, int width1, int width2) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
-
+void testSequenceSoftmax(int batchSize) {
+  // forward
+  int inputDim = 1;
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
   cpuInput->randomizeUniform();
   gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
 
-  int columnOffset = 0;
-  int offset = std::abs(width1 - width2);
-  if (offset) {
-    columnOffset = rand() % offset;  // NOLINT
-  }
-  cpuOutput->assignAtOffset(*cpuInput, columnOffset);
-  gpuOutput->assignAtOffset(*gpuInput, columnOffset);
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
+  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
+  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
+
+  TensorCheckErr(*cpuInput, *gpuInput);
 }
 
-TEST(Matrix, AtOffset) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width1 : {1, 32, 100, 512, 1000}) {
-      for (auto width2 : {1, 32, 100, 512, 1000}) {
-        VLOG(3) << " height=" << height << " width1=" << width1
-                << " width2=" << width2;
+void testMatrixSoftmaxThreshold(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  cpuInput->getData()[0] = 100.0;
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
 
-        testMatrixAddAtOffset(height, width1, width2);
-        testMatrixAssignAtOffset(height, width1, width2);
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
+  outputCheck->copyFrom(*gpuOutput);
+  // check output zero
+  int cpuCount = 0;
+  int gpuCount = 0;
+  auto zeroNum = [](MatrixPtr out, int& count) {
+    for (size_t i = 0; i < out->getHeight(); i++) {
+      for (size_t j = 0; j < out->getWidth(); j++) {
+        if (out->getElement(i, j) == 0) count++;
       }
     }
-  }
+  };
+  zeroNum(cpuOutput, cpuCount);
+  zeroNum(outputCheck, gpuCount);
+  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
+  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
 }
 
-void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
-  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
-  cpuTable->randomizeUniform();
-  gpuTable->copyFrom(*cpuTable);
-
-  IVectorPtr cpuIds;
-  IVectorPtr gpuIds;
-  cpuIds = VectorT<int>::create(numSamples, false);
-  gpuIds = VectorT<int>::create(numSamples, true);
-  cpuIds->rand(tableSize);
-  gpuIds->copyFrom(*cpuIds);
+void testMatrixSoftmaxBp(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
 
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
   cpuOutput->randomizeUniform();
   gpuOutput->copyFrom(*cpuOutput);
+  gpuOutput->softmaxBackward(*gpuInput);
 
-  cpuOutput->selectRows(*cpuTable, *cpuIds);
-  gpuOutput->selectRows(*gpuTable, *gpuIds);
+  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
+  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
+  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
+  sftMaxSum->colMerge(*sftMaxDot);
+  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+}
+
+TEST(Matrix, softmax) {
+  for (auto height : {1, 11, 73, 128, 200}) {
+    for (auto width : {1, 32, 100, 512, 1000}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+
+      testMatrixSoftmax(height, width);
+      testMatrixSoftmaxBp(height, width);
+      testMatrixSoftmaxThreshold(height, width);
+    }
+    testSequenceSoftmax(height);
+  }
 }
 
 void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
@@ -1322,10 +470,7 @@ void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
   cpuOutput->addToRows(*cpuTable, *cpuIds);
   gpuOutput->addToRows(*gpuTable, *gpuIds);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  outputCheck->copyFrom(*gpuTable);
-  MatrixCheckErr(*cpuTable, *outputCheck);
+  TensorCheckErr(*cpuTable, *gpuTable);
 }
 
 TEST(Matrix, tableProjection) {
@@ -1334,7 +479,6 @@ TEST(Matrix, tableProjection) {
       for (auto inputDim : {20, 50}) {
         VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
                 << " inputDim=" << inputDim;
-        testMatrixSelectRows(numSamples, tableSize, inputDim);
         testMatrixAddToRows(numSamples, tableSize, inputDim);
       }
     }
@@ -1368,9 +512,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   cpuC->mul(cpuA, cpuB, alpha, beta);
   gpuC->mul(gpuA, gpuB, alpha, beta);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
-  outputCheck->copyFrom(*gpuC);
-  MatrixCheckErr(*cpuC, *outputCheck);
+  TensorCheckErr(*cpuC, *gpuC);
 }
 
 void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
@@ -1442,9 +584,7 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
   subCpuC->mul(subCpuA, subCpuB, alpha, beta);
   subGpuC->mul(subGpuA, subGpuB, alpha, beta);
 
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
-  outputCheck->copyFrom(*gpuC);
-  MatrixCheckErr(*cpuC, *outputCheck);
+  TensorCheckErr(*cpuC, *gpuC);
 }
 
 TEST(Matrix, mul) {
@@ -1498,9 +638,7 @@ void testVectorReset(int size) {
   cpu->reset(value);
   gpu->reset(value);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpu);
-  VectorCheckEqual(*cpu, *out);
+  TensorCheckEqual(*cpu, *gpu);
 }
 
 template <class T>
@@ -1526,9 +664,7 @@ void testVecortSelectFrom(int size) {
   cpuDst->selectFrom(*cpuSrc, *cpuIds);
   gpuDst->selectFrom(*gpuSrc, *gpuIds);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpuDst);
-  VectorCheckEqual(*cpuDst, *out);
+  TensorCheckEqual(*cpuDst, *gpuDst);
 }
 
 template <class T>
@@ -1539,9 +675,7 @@ void testVecotrZeroMem(int size) {
   cpu->zeroMem();
   gpu->zeroMem();
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpu);
-  VectorCheckEqual(*cpu, *out);
+  TensorCheckEqual(*cpu, *gpu);
 }
 
 template <class T>
@@ -1562,9 +696,7 @@ void testVectorIsEqual(int size) {
   cpuA->isEqualTo(*cpuB, value);
   gpuA->isEqualTo(*gpuB, value);
 
-  std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
-  out->copyFrom(*gpuA);
-  VectorCheckEqual(*cpuA, *out);
+  TensorCheckEqual(*cpuA, *gpuA);
 }
 
 TEST(Vector, Equal) {
@@ -1595,9 +727,7 @@ void testMatrixTopK(int samples, int dim, int beamSize) {
   cpuSrc->rowMax(*cpuIds, *cpuVal);
   gpuSrc->rowMax(*gpuIds, *gpuVal);
 
-  MatrixPtr outVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  outVal->copyFrom(*gpuVal);
-  MatrixCheckEqual(*cpuVal, *outVal);
+  TensorCheckEqual(*cpuVal, *gpuVal);
 }
 
 TEST(Matrix, topK) {
@@ -1633,9 +763,7 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
   cpuSrc->rowMax(*cpuIds, *cpuVal);
   gpuSrc->rowMax(*gpuIds, *gpuVal);
 
-  MatrixPtr outCheckMaxVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  outCheckMaxVal->copyFrom(*gpuVal);
-  MatrixCheckEqual(*cpuVal, *outCheckMaxVal);
+  TensorCheckEqual(*cpuVal, *gpuVal);
 
   IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
   outCheckIds->copyFrom(*gpuIds);
@@ -1665,42 +793,6 @@ TEST(SMatrix, topK) {
   }
 }
 
-void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(inHeight, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(inHeight, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(outHeight, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(outHeight, width);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  IVectorPtr cpuRowIndex = IVector::create(outHeight, false);
-  IVectorPtr gpuRowIndex = IVector::create(outHeight, true);
-  cpuRowIndex->rand(inHeight);
-  gpuRowIndex->copyFrom(*cpuRowIndex);
-
-  cpuOutput->copyByRowIndex(*cpuInput, *cpuRowIndex);
-  gpuOutput->copyByRowIndex(*gpuInput, *gpuRowIndex);
-
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(outHeight, width);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckEqual(*cpuOutput, *outputCheck);
-}
-
-TEST(Matrix, copyByRowIndex) {
-  for (auto outHeight : {31, 500, 1000}) {
-    for (auto inHeight : {17, 257, 500, 1200}) {
-      for (auto width : {512, 1024}) {
-        VLOG(3) << outHeight << " " << inHeight << " " << width;
-        testMatrixCopyByRowIndex(outHeight, inHeight, width);
-      }
-    }
-  }
-}
-
 void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
   MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
@@ -1721,10 +813,7 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
   cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
   gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
 
-  // check
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  outputCheck->copyFrom(*gpuOutput);
-  MatrixCheckErr(*cpuOutput, *outputCheck);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
 }
 
 TEST(Matrix, sequenceAvgForward) {
@@ -1739,45 +828,6 @@ TEST(Matrix, sequenceAvgForward) {
   }
 }
 
-void testCosSim(int heightX, int heightY, int width, real scale) {
-  MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
-  MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
-
-  prevOutX->randomizeUniform();
-  prevOutY->randomizeUniform();
-  prevOutX->add(-0.5);
-  prevOutY->add(-0.5);
-  output->randomizeUniform();
-
-  MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
-  MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
-
-  prevOutXGpu->copyFrom(*prevOutX);
-  prevOutYGpu->copyFrom(*prevOutY);
-  outputGpu->copyFrom(*output);
-
-  output->cosSim(*prevOutX, *prevOutY, scale);
-  outputGpu->cosSim(*prevOutXGpu, *prevOutYGpu, scale);
-
-  MatrixPtr outputCheck = CpuMatrix::create(heightX, 1, false, false);
-  outputCheck->copyFrom(*outputGpu);
-  MatrixCheckErr(*output, *outputCheck);
-}
-
-TEST(Matrix, cosSim) {
-  for (auto heightX : {10, 100, 1000}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSim(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
   MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
   MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
@@ -1817,12 +867,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
                             *prevGradYGpu,
                             scale);
 
-  MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, false);
-  prevGradXCheck->copyFrom(*prevGradXGpu);
-  prevGradYCheck->copyFrom(*prevGradYGpu);
-  MatrixCheckErr(*prevGradX, *prevGradXCheck);
-  MatrixCheckErr(*prevGradY, *prevGradYCheck);
+  TensorCheckErr(*prevGradX, *prevGradXGpu);
+  TensorCheckErr(*prevGradY, *prevGradYGpu);
 }
 
 TEST(Matrix, cosSimDerivate) {
@@ -1837,80 +883,6 @@ TEST(Matrix, cosSimDerivate) {
   }
 }
 
-void testParamReluForward(int height, int width, int w_height, int w_width) {
-  MatrixPtr output = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  output->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr outputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-
-  output->paramReluForward(*input, *w);
-  outputGpu->paramReluForward(*inputGpu, *wGpu);
-
-  MatrixPtr outputCheck = CpuMatrix::create(height, width, false, false);
-  outputCheck->copyFrom(*outputGpu);
-  MatrixCheckEqual(*output, *outputCheck);
-}
-
-TEST(Matrix, paramReluForward) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          testParamReluForward(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
-  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  oGrad->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  oGradGpu->copyFrom(*oGrad);
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-
-  w->paramReluBackwardW(*oGrad, *input);
-  wGpu->paramReluBackwardW(*oGradGpu, *inputGpu);
-  MatrixPtr wCheck = CpuMatrix::create(w_height, w_width, false, false);
-  wCheck->copyFrom(*wGpu);
-  MatrixCheckErr(*w, *wCheck);
-}
-
-TEST(Matrix, paramReluBackwardW) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          testParamReluBackwardW(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluBackwardDiff(int height,
                                int width,
                                int w_height,
@@ -1939,9 +911,7 @@ void testParamReluBackwardDiff(int height,
   diff->paramReluBackwardDiff(*oGrad, *input, *w);
   diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
 
-  MatrixPtr diffCheck = CpuMatrix::create(height, width, false, false);
-  diffCheck->copyFrom(*diffGpu);
-  MatrixCheckErr(*diff, *diffCheck);
+  TensorCheckErr(*diff, *diffGpu);
 }
 
 TEST(Matrix, paramReluBackwardDiff) {
@@ -1972,9 +942,7 @@ void testClassificationError(int numSamples, int dim) {
   cpuError->classificationError(cpuOutput, cpuLabel);
   gpuError->classificationError(gpuOutput, gpuLabel);
 
-  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
-  check->copyFrom(*gpuError);
-  MatrixCheckEqual(*cpuError, *check);
+  TensorCheckEqual(*cpuError, *gpuError);
 }
 
 TEST(Matrix, classificationError) {
@@ -2139,9 +1107,8 @@ void testAvgPoolFwdBwd(int numSamples,
                             outW,
                             padH,
                             padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
+
+  TensorCheckErr(*target, *targetGpu);
 
   MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
   MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
@@ -2180,10 +1147,8 @@ void testAvgPoolFwdBwd(int numSamples,
                                 1.0,
                                 padH,
                                 padW);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetBwdCheck);
+
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
 TEST(Matrix, PoolFwdBwd) {
@@ -2248,11 +1213,9 @@ void testMaxOutFwdBwd(
 
   MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
   MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
 
   IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
   IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
-  IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
 
   input->randomizeUniform();
   inputGpu->copyFrom(*input);
@@ -2260,11 +1223,8 @@ void testMaxOutFwdBwd(
   target->maxoutForward(*input, *id, outChannels, groups);
   targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
 
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-  idCheck->copyFrom(*idGpu);
-  VectorCheckEqual(*id, *idCheck);
+  TensorCheckErr(*target, *targetGpu);
+  TensorCheckEqual(*id, *idGpu);
 
   // backward
   MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
@@ -2273,8 +1233,6 @@ void testMaxOutFwdBwd(
   MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
   MatrixPtr targetGpuGrad =
       GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
 
   inputGrad->randomizeUniform();
   targetGrad->randomizeUniform();
@@ -2284,9 +1242,7 @@ void testMaxOutFwdBwd(
   inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
   inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
 
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
 TEST(Matrix, MaxOutFwdBwd) {
@@ -2306,113 +1262,6 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-void testAddSharedBias(int numSamples, int dim, int channel) {
-  MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
-  MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
-
-  cpuData->randomizeUniform();
-  gpuData->copyFrom(*cpuData);
-  cpuBias->randomizeUniform();
-  gpuBias->copyFrom(*cpuBias);
-
-  cpuData->addSharedBias(*cpuBias, 1.0);
-  gpuData->addSharedBias(*gpuBias, 1.0);
-
-  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, dim);
-  check->copyFrom(*gpuData);
-  MatrixCheckErr(*cpuData, *check);
-}
-
-void testCollectSharedBias(int numSamples, int dim, int channel) {
-  MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
-  MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
-
-  cpuData->randomizeUniform();
-  gpuData->copyFrom(*cpuData);
-  cpuBias->randomizeUniform();
-  gpuBias->copyFrom(*cpuBias);
-
-  cpuBias->collectSharedBias(*cpuData, 1.0);
-  gpuBias->collectSharedBias(*gpuData, 1.0);
-
-  MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
-  check->copyFrom(*gpuBias);
-  MatrixCheckErr(*cpuBias, *check);
-}
-
-TEST(Matrix, sharedBias) {
-  for (auto numSamples : {1, 100, 520}) {
-    for (auto dim : {100 * 16, 100 * 32}) {
-      for (auto channel : {8, 16}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " channel=" << channel;
-        testAddSharedBias(numSamples, dim, channel);
-        testCollectSharedBias(numSamples, dim, channel);
-      }
-    }
-  }
-}
-
-void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
-  MatrixPtr output = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuEntropy = std::make_shared<CpuMatrix>(numSamples, 1);
-  MatrixPtr gpuEntropy = std::make_shared<GpuMatrix>(numSamples, 1);
-
-  MatrixPtr cpuGrad = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuGrad = std::make_shared<GpuMatrix>(numSamples, dim);
-
-  MatrixPtr cpuLabel = std::make_shared<CpuSparseMatrix>(
-      numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
-  MatrixPtr gpuLabel = std::make_shared<GpuSparseMatrix>(
-      numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
-  for (int i = 0; i < numSamples; i++) {
-    const unsigned int id = rand() % dim;  // NOLINT
-    cpuLabel->setRow(i, 1, &id, nullptr);
-    gpuLabel->setRow(i, 1, &id, nullptr);
-  }
-
-  output->randomizeUniform();
-  cpuOutput->zeroMem();
-  output->softmax(*cpuOutput);
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuEntropy->zeroMem();
-  gpuEntropy->zeroMem();
-  cpuEntropy->multiBinaryLabelCrossEntropy(*cpuOutput, *cpuLabel);
-  gpuEntropy->multiBinaryLabelCrossEntropy(*gpuOutput, *gpuLabel);
-
-  MatrixPtr check1 = std::make_shared<CpuMatrix>(numSamples, 1);
-  check1->copyFrom(*gpuEntropy);
-  MatrixCheckErr(*cpuEntropy, *check1);
-
-  cpuGrad->zeroMem();
-  gpuGrad->zeroMem();
-  cpuGrad->multiBinaryLabelCrossEntropyBp(*cpuOutput, *cpuLabel);
-  gpuGrad->multiBinaryLabelCrossEntropyBp(*gpuOutput, *gpuLabel);
-
-  MatrixPtr check2 = std::make_shared<CpuMatrix>(numSamples, dim);
-  check2->copyFrom(*gpuGrad);
-  MatrixCheckErr(*cpuGrad, *check2);
-}
-
-TEST(Matrix, multiBinaryCrossEntropy) {
-  for (auto numSamples : {100, 1000, 10000}) {
-    for (auto dim : {100, 1000, 10000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testMultiBinaryLabelCrossEntropy(numSamples, dim);
-    }
-  }
-}
-
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
index 5300e7168b9dc61b65e64346424e65c11665cf99..9aa74b15193723970d80b5d1a4e0ac95341cd45a 100644
--- a/paddle/math/tests/test_matrixUtil.h
+++ b/paddle/math/tests/test_matrixUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index 837c2f47ba05a04988431e14cb6bc2490f42d32e..60ebae015381a3901c14d0cd4c1225e54ac5726f 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
-#include <cmath>
+#include <cuda_runtime.h>
 #include <gtest/gtest.h>
+#include <cmath>
 #include <vector>
-#include <cuda_runtime.h>
 #include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
 
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index d7aa20eb984417ff3907b078a263c5651d6209d3..6f6de238bacaade85d728b7d773145326229015a 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,10 +17,10 @@ limitations under the License. */
 //  so disable when
 /// only cpu version.
 
-#include "paddle/utils/Util.h"
+#include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Util.h"
 #include "test_matrixUtil.h"
-#include <gtest/gtest.h>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 81d53f065b84b2699141fc599b9efba794bbd25a..65d01a15718ae2bebd4869eff0e5407524bc0e7c 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -225,6 +225,8 @@ void Argument::resizeAndCopyFrom(const Argument& src,
   }
   resizeAndCopy(udp, src.udp, useGpu, stream);
   resizeAndCopy(strs, src.strs, useGpu, stream);
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
 }
 
 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -243,6 +245,8 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
                                     bool useGpu,
                                     hl_stream_t stream) {
   dataId = src.dataId;
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
@@ -549,11 +553,10 @@ void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
     }
     seqInfo->push_back(info);
   }
-  std::sort(seqInfo->begin(),
-            seqInfo->end(),
-            [](const SeqInfo& a, const SeqInfo& b) {
-              return a.topLevelLength > b.topLevelLength;
-            });
+  std::sort(
+      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
+        return a.topLevelLength > b.topLevelLength;
+      });
 }
 
 void Argument::checkSubset() const {
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 2b20122debf935562d36f29d872e8ef3243111e0..afd2de0202bf0f14ec3d4c5b856455a3488e41f6 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/parameter/Parameter.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Util.h"
-#include "paddle/parameter/Parameter.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/parameter/AverageOptimizer.cpp
index 593594761ed57495b92a30a8f3e8e86cdb45bfce..e51ca5652090e6fba5e2070fc8f8c1d10e9ecc7a 100644
--- a/paddle/parameter/AverageOptimizer.cpp
+++ b/paddle/parameter/AverageOptimizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h
index ccc2612608db574274f3e0acaacec7f9eb404223..9fd3f75baa6dd54620e7e7a93f14deb598999cf4 100644
--- a/paddle/parameter/AverageOptimizer.h
+++ b/paddle/parameter/AverageOptimizer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
index a9be07d062992ff24175339c630426d27e84c22b..dbb738e98b5874f5bb33026ad585a6c3ef327d1d 100644
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
-
 #include "FirstOrderOptimizer.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 #include <cmath>
 
-P_DEFINE_bool(log_clipping, false, "enable log clipping or not");
+DEFINE_bool(log_clipping, false, "enable log clipping or not");
 
 namespace paddle {
 
@@ -115,19 +115,28 @@ void SparseMomentumParameterOptimizer::finishBatch() {
 void AdagradParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
-                                                1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
-                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-  vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  adagradApply(value,
+               grad,
+               mom,
+               accum_buffer,
+               accum,
+               lr,
+               epsilon,
+               learningRate,
+               momentum,
+               decayRate);
 }
 
 ParameterOptimizer::TraverseCallback
@@ -152,37 +161,41 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                         const ParameterConfig& config,
                                         size_t sparseId) const {
   CHECK(sparseId == -1LU) << "Sparse update is not supported";
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_);
-
-  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
-  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
-                                        epsilon_,
-                                        epsilon_);
-  vecs[PARAMETER_LEARNING_RATE]->sqrt();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
-      *vecs[PARAMETER_GRADIENT],
-      *vecs[PARAMETER_LEARNING_RATE],
-      rou_,
-      1.0f - rou_);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  adadeltaApply(value,
+                grad,
+                mom,
+                accum,
+                accum_update,
+                lr,
+                rou_,
+                epsilon_,
+                learningRate,
+                momentum,
+                decayRate);
 }
 
 void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
-  real accumulatedRou = rou_;
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
 
+  real accumulatedRou = rou_;
   bool firstTime = timer_ == 0;
   if (sparseId != -1LU) {
     CHECK_LT(sparseId, t0Vec_.size());
@@ -191,40 +204,36 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
     t0Vec_[sparseId] = timer_ + 1;
   }
 
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT],
-      accumulatedRou,
-      firstTime ? 1.0f : 1.0f - rou_);
-
-  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                           -1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  rmspropApply(value,
+               grad,
+               mom,
+               sum,
+               sum1,
+               lr,
+               accumulatedRou,
+               rou_,
+               epsilon,
+               learningRate,
+               momentum,
+               decayRate,
+               firstTime);
 }
 
 void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
                                               const ParameterConfig& config,
                                               size_t sparseId) const {
-  real accumulatedRou = rou_;
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
 
+  real accumulatedRou = rou_;
   bool firstTime = timer_ == 0;
   if (sparseId != -1LU) {
     CHECK_LT(sparseId, t0Vec_.size());
@@ -233,77 +242,62 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
     t0Vec_[sparseId] = timer_ + 1;
   }
 
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT],
-      accumulatedRou,
-      firstTime ? 1.0f : 1.0f - rou_);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  vecs[PARAMETER_LEARNING_RATE]->assign(optConfig_.ada_epsilon());
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  decayedAdagradApply(value,
+                      grad,
+                      mom,
+                      sum,
+                      lr,
+                      accumulatedRou,
+                      rou_,
+                      epsilon,
+                      learningRate,
+                      momentum,
+                      decayRate,
+                      firstTime);
 }
 
 void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                     const ParameterConfig& config,
                                     size_t sparseId) const {
   CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1_, 1 - beta1_);
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  g->square();
-  v->add(*g, beta2_, 1 - beta2_);
-
-  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
-  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
-  g->sqrt(*v);
-  g->dotDiv(*m, *g, 0., epsilon_);
-  real alpha = config.learning_rate() * learningRate_;
-  alpha = alpha * std::sqrt(1 - std::pow(beta2_, step_)) /
-          (1 - std::pow(beta1_, step_));
-  theta->add(*theta, 1.0, *g, -alpha);
+  real beta1_power = std::pow(beta1_, step_);
+  real beta2_power = std::pow(beta2_, step_);
+  real learningRate = config.learning_rate() * learningRate_;
+
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
+
+  adamApply(value,
+            grad,
+            mom,
+            v,
+            beta1_,
+            beta2_,
+            beta1_power,
+            beta2_power,
+            epsilon_,
+            learningRate);
 }
 
 void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
   CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1_, 1 - beta1_);
+  real learningRate = config.learning_rate() * learningRate_;
 
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u->mulScalar(beta2_);
-  g->abs();
-  u->max(*u, *g);
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
 
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  g->dotDiv(*m, *u);
-  real learningRate = config.learning_rate() * learningRate_;
-  learningRate /= (1 - std::pow(beta1_, step_));
-  theta->add(*theta, 1.0, *g, -learningRate);
+  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
 }
 
 void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index a9a2ffdd41310d1927df012be8328d0e4bd3af0f..095019b74f4f667991a0d4c5d5511e371889539f 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp
index a7412500ccfa05707286f0ad493ad8280eee1cbc..66448b2c5f6f440c3a7fdec2bd552d8dc7d0927f 100644
--- a/paddle/parameter/LearningRateScheduler.cpp
+++ b/paddle/parameter/LearningRateScheduler.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h
index e987c3dcde120b8c88d58de7a18ee5c6db85bb5c..53b9dba446e1c5908c1b990b026e6fbd4d8fd260 100644
--- a/paddle/parameter/LearningRateScheduler.h
+++ b/paddle/parameter/LearningRateScheduler.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/parameter/OptimizerFunctions.cpp
index 6fd7964347644214533007dc1e11e6fa45ee9ea6..a4af1b470585937827305100e3fbf603e67c8241 100644
--- a/paddle/parameter/OptimizerFunctions.cpp
+++ b/paddle/parameter/OptimizerFunctions.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/parameter/OptimizerFunctions.h
index a5f8b2c56942720335c0df6c9d71fd4e15494600..4f7370b6baf2b92469e73d54ecb25a23d73c3506 100644
--- a/paddle/parameter/OptimizerFunctions.h
+++ b/paddle/parameter/OptimizerFunctions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/parameter/OptimizerWithRegularizer.cpp
index 5381e7bef3b177884d85671ef6e3dfbc0de1d5ed..85f13c8bc08c534224a1a8365d541737980b439f 100644
--- a/paddle/parameter/OptimizerWithRegularizer.cpp
+++ b/paddle/parameter/OptimizerWithRegularizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h
index ebe23c7397f6d3f14976422342953e493a6fbee1..0e1c444d280886866c3b153988b1887a43c161c6 100644
--- a/paddle/parameter/OptimizerWithRegularizer.h
+++ b/paddle/parameter/OptimizerWithRegularizer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp
index 99b20a59ca2a8b4a84a5bcbd0fab135ac54de61c..cea77e5b1787c25ecb9ccd42e948bf90973fd4cb 100644
--- a/paddle/parameter/ParallelParameter.cpp
+++ b/paddle/parameter/ParallelParameter.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include <fstream>
+#include "paddle/utils/Logging.h"
 
 #include "ParallelParameter.h"
 
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index 2b65321fe201ae166dbbd6629e9a0ab0c6481699..417e386dc74d308a6c0aefa2640f0f37de8dbf1f 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,19 +16,19 @@ limitations under the License. */
 
 #include <stdint.h>
 
+#include <sys/time.h>
+#include <unistd.h>
 #include <iostream>
 #include <string>
 #include <vector>
-#include <sys/time.h>
-#include <unistd.h>
 
 #include "hl_gpu.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 7e37bf225ba25e8bae269cf45b69ce418a54d1a3..1673fc6e533e416dfe4db557a1a8968667d1bfff 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,25 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "Parameter.h"
 #include <fstream>
-#include "paddle/math/MathUtils.h"
 #include "AverageOptimizer.h"
 #include "FirstOrderOptimizer.h"
-#include "Parameter.h"
-#include "paddle/utils/Logging.h"
 #include "OptimizerFunctions.h"
 #include "OptimizerWithRegularizer.h"
 #include "ParameterUpdateFunctions.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
 #include "hl_gpu.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(enable_grad_share,
-               (100 * 1024 * 1024),
-               "threshold for enable gradient parameter share for batch "
-               "multi-cpu training");
-P_DEFINE_int32(
+DEFINE_int32(enable_grad_share,
+             (100 * 1024 * 1024),
+             "threshold for enable gradient parameter share for batch "
+             "multi-cpu training");
+DEFINE_int32(
     grad_share_block_num,
     64,
     "block number of gradient parameter share for batch multi-cpu training");
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 1c159d669a6a0f7b56c377e0b1cfa35b3fb75d53..532c6770e596c33dfe7fd42f32157b2c6c19e18e 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,14 +23,14 @@ limitations under the License. */
 #include "ParameterConfig.pb.h"
 #include "TrainerConfig.pb.h"
 
+#include "ParameterUpdaterHook.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "ParameterUpdaterHook.h"
-#include "paddle/utils/GlobalConstants.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/parameter/ParameterOptimizer.cpp
index 2a71d6aee4dae556956616bd317156cfaf8732f0..7c8c6978e22c0e3cb3a3bdf8a7709a091cc2292b 100644
--- a/paddle/parameter/ParameterOptimizer.cpp
+++ b/paddle/parameter/ParameterOptimizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
index 21a148333c2fd3aa127c5b3bb8160784864f4cce..2bdc793d605e01f8e055087bb3e0973168cb0213 100644
--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index 510ec5bf48a7576f646ecf01b02c5047c637afeb..c8af7105c78dcbf9f625a348b7f38efcf278469e 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 2d98030bd2389469fbd32940af6162203557620c..2d277e47e7eafc118fa37343e93e8a331a260aa9 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp
index e706742053fc49df9c99081774f425622941e38c..49e2ae2b393f4a5e6c0986bc5e645011f5a3eca1 100644
--- a/paddle/parameter/ParameterUpdaterBase.cpp
+++ b/paddle/parameter/ParameterUpdaterBase.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
-#include "paddle/utils/Logging.h"
 #include "ParameterUpdaterBase.h"
+#include <fstream>
 #include "hl_gpu.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index ffd2980261530382ee09f2c98e354d0e56fd8038..5401046f676892c415f3e29ed1adbec75f5abe74 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 7d85a32c0cf527d39c252c2021b7bad0eb58753d..f826e8448c666bb3305c150f2bd95aade23223fb 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "ParameterUpdaterHook.h"
 
+#include <atomic>
 #include <fstream>
-#include <unordered_map>
 #include <mutex>
-#include <atomic>
 #include <thread>
+#include <unordered_map>
 
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -156,7 +156,8 @@ private:
 
 static WeakKVCache<std::pair<std::string, int>,
                    IParameterUpdaterHook,
-                   StringIntPairHasher> g_hookCache_;
+                   StringIntPairHasher>
+    g_hookCache_;
 
 /**
  * ParameterUpdaterHook actually factory method.
diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h
index 553282bcaaa2e90910eaafbe2e03a4afadf04a85..1f4506441da725e13ed2b0f339e6e2f5f30bcc6a 100644
--- a/paddle/parameter/ParameterUpdaterHook.h
+++ b/paddle/parameter/ParameterUpdaterHook.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp
index a9bddc1596656ba36d6c445781f42991684f0c52..8511900150363a2247d508833eeb42b2d87beec1 100644
--- a/paddle/parameter/Regularizer.cpp
+++ b/paddle/parameter/Regularizer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
 #include "Regularizer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h
index 5baaccc00db5f858272dbfa6751647915bfa6e3c..6d5477309812da4dda4bb7a4a869a2fde7f6bcf0 100644
--- a/paddle/parameter/Regularizer.h
+++ b/paddle/parameter/Regularizer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp
index c138010607412fa257a6c7360a27d855197f88ad..3738a58d7f84081db9b6179cef9361322553a627 100644
--- a/paddle/parameter/Weight.cpp
+++ b/paddle/parameter/Weight.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Weight.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h
index 531b571cbc0055257e9a88037515834364439dc3..6e7a49154e87b89929cea066872c282f9410e776 100644
--- a/paddle/parameter/Weight.h
+++ b/paddle/parameter/Weight.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 1a64fe335257a3107be03cfd333cb483c5ab452d..aa57a6346917b259dbb89f6ad2340fb8db28f3e3 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdlib.h>
 #include <paddle/utils/Util.h>
+#include <stdlib.h>
 
 #include <gtest/gtest.h>
-#include <paddle/utils/Flags.h>
 #include <paddle/parameter/ParameterUpdateFunctions.h>
+#include <paddle/utils/Flags.h>
 #include <paddle/utils/Stat.h>
 #include <paddle/utils/Thread.h>
 
diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp
index ff83970ab1b11f74ceb4009cc8f469f7b54a7272..b4ac7a2506921b2409baaff077cc3541f3dc8d73 100644
--- a/paddle/pserver/BaseClient.cpp
+++ b/paddle/pserver/BaseClient.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <vector>
+#include "BaseClient.h"
 #include <string.h>
-#include "paddle/utils/Stat.h"
+#include <vector>
 #include "paddle/utils/CommandLineParser.h"
-#include "BaseClient.h"
+#include "paddle/utils/Stat.h"
 
-P_DECLARE_string(pservers);
+DECLARE_string(pservers);
 
 namespace paddle {
 
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 3a501172b70a91e02ecda0f9f78e0c025ac67936..262afafbe2d61305a158d945fac2d3b265012cbd 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pserver/ProtoServer.h"
+#include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/TypeDefs.h"
-#include "ParameterService.pb.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 1830170a163fa47114c75a2a88a731ea31060142..cbc105e651faa0f283b3becb10449f4e1bc78b38 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,42 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/types.h>
-#include <sys/socket.h>
+#include <fcntl.h>
 #include <netdb.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
-#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 
 #include <arpa/inet.h>
-#include <sys/ioctl.h>
 #include <net/if.h>
 #include <net/if_arp.h>
+#include <sys/ioctl.h>
 #include <sstream>
 
 #include "LightNetwork.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
 /// quick ack can reduce the latency of small message
-P_DEFINE_bool(small_messages,
-              false,
-              "if message size is small, recommend set it True to enable quick "
-              "ack and no delay");
+DEFINE_bool(small_messages,
+            false,
+            "if message size is small, recommend set it True to enable quick "
+            "ack and no delay");
 
 /// reasonable sock_send_buf_size can control the traffic injected into switch
 /// network. Injecting too many data into traffic could cause packets loss which
 /// cause long latency and degrade the efficiency of communication.
-P_DEFINE_int32(sock_send_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock send buff size, can reduce network congestion if "
-               "set carefully");
+DEFINE_int32(sock_send_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock send buff size, can reduce network congestion if "
+             "set carefully");
 
 /// reasonable size can hold bursted packets and reduce packets loss
-P_DEFINE_int32(sock_recv_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock recv buff size");
+DEFINE_int32(sock_recv_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock recv buff size");
 
 namespace paddle {
 
diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h
index b7d7bc7902abb18aae03fc4d8a3972f0298199fe..c4a06deb940e8f39af2fcb6de54de1b6cb2d1483 100644
--- a/paddle/pserver/LightNetwork.h
+++ b/paddle/pserver/LightNetwork.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include "SocketChannel.h"
 
+#include <atomic>
 #include <memory>
 #include <thread>
 #include <vector>
-#include <atomic>
 
 #include "paddle/utils/Thread.h"
 
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index 28cc0ae2dd36273397015e618f6e14ea43398964..a97859f83fe6495b298e920346c964ef2a9b146c 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,34 +15,27 @@ limitations under the License. */
 #include <unistd.h>
 
 #include "ParameterClient2.h"
-#include "paddle/utils/StringUtil.h"
+#include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseRowMatrix.h"
+#include "paddle/utils/StringUtil.h"
 
-P_DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
 
 namespace paddle {
 
-template <class T>
-void copyToRepeatedField(google::protobuf::RepeatedField<T>* dest,
-                         const T* src,
+template <typename T1, typename T2>
+void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
+                         const T2* src,
                          size_t size) {
   dest->Clear();
   dest->Reserve(size);
-
   for (size_t i = 0; i < size; ++i) {
     dest->AddAlreadyReserved(src[i]);
   }
 }
 
-template <class T>
-void copyToRepeatedField(const std::vector<T>& src,
-                         google::protobuf::RepeatedField<T>* dest) {
-  copyToRepeatedField(dest, &src[0], src.size());
-}
-
 ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
     : BaseClient(separate, numPorts), port_(port) {
 #ifndef PADDLE_DISABLE_TIMER
@@ -618,6 +611,8 @@ void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
       pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
 }
 
+static inline real addTwo(real a, double b) { return a + b; }
+
 void ParameterClient2::doOperation(PreparedOperations& ops,
                                    bool waitForGradient,
                                    bool sendBackGradient,
@@ -682,8 +677,11 @@ void ParameterClient2::doOperation(PreparedOperations& ops,
         CpuVectorPtr rvec = resultVectors[i];
         if (!rvec) continue;
         CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        CpuVector avec(rvec->getSize(), const_cast<real*>(vec.values().data()));
-        rvec->add(avec);
+        std::transform(rvec->getData(),
+                       rvec->getData() + rvec->getSize(),
+                       vec.values().data(),
+                       rvec->getData(),
+                       addTwo);
       }
 
       CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
@@ -693,11 +691,12 @@ void ParameterClient2::doOperation(PreparedOperations& ops,
         if (!rmat) continue;
         CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
         CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-        CpuMatrixPtr amat =
-            std::make_shared<CpuMatrix>(const_cast<real*>(mat.values().data()),
-                                        rmat->getHeight(),
-                                        rmat->getWidth());
-        rmat->add(*amat);
+
+        std::transform(rmat->getData(),
+                       rmat->getData() + rmat->getElementCnt(),
+                       mat.values().data(),
+                       rmat->getData(),
+                       addTwo);
       }
     }
   }
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index af8dd41ec4327fcf78625e7aa5d4b136ca7d14dd..eed71ccb43b0fec76a74a7f00662c32c97c26ff4 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,25 +16,25 @@ limitations under the License. */
 
 #include <atomic>
 #include <mutex>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
-#include "paddle/utils/Locks.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/pserver/BaseClient.h"
 
 #include "ParameterService.pb.h"
 
-#include "SparseParameterDistribution.h"
 #include "ProtoServer.h"
+#include "SparseParameterDistribution.h"
 
-P_DECLARE_int32(parallel_thread_num);
+DECLARE_int32(parallel_thread_num);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index b7f999f8b132e59ce8b7dffe5c4d43615e4c564c..856fa0ad1ab30e3fc554ac96dd3bed71b1548579 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,20 +21,20 @@ limitations under the License. */
 
 #include "paddle/parameter/AverageOptimizer.h"
 #include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/utils/Flags.h"
 #include "paddle/parameter/OptimizerFunctions.h"
 #include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/parameter/ParameterUpdateFunctions.h"
 #include "paddle/parameter/Regularizer.h"
-#include "paddle/utils/Stat.h"
+#include "paddle/utils/Flags.h"
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-P_DEFINE_double(async_lagged_ratio_min,
-                1.0,
-                "control config_.async_lagged_grad_discard_ratio() min value");
-P_DEFINE_double(
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
     async_lagged_ratio_default,
     1.5,
     "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index ccaea42e7d0cb1865234702315fd4bbd00e548d5..b0cf22e1fb158e76fcee1ce6ef1f375995803ce6 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,30 +15,30 @@ limitations under the License. */
 #pragma once
 
 #include <atomic>
+#include <limits>
 #include <mutex>
 #include <string>
-#include <vector>
-#include <unordered_map>
 #include <type_traits>
-#include <limits>
+#include <unordered_map>
+#include <vector>
 
 #include <stddef.h>
 #include <stdlib.h>
 
-#include "paddle/utils/Locks.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Stat.h"
 
 #include "ParameterService.pb.h"
 
 #include "ProtoServer.h"
 
-P_DECLARE_int32(port);
+DECLARE_int32(port);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp
index b15ef8c3ccc9220d38514e927e0e10184e901846..ffc521f2c143d95ff07c3825e0a746cb31743d9b 100644
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include <fstream>
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
-#include "paddle/utils/Flags.h"
 #include "ParameterServer2.h"
 #include "RDMANetwork.h"
+#include "paddle/utils/Flags.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/pserver/ProtoServer.cpp
index 2f6d911a017d231692c42f2a235cf1e15257f7ae..410317ece28ec79dd668e91ff9fbed11f20a5acc 100644
--- a/paddle/pserver/ProtoServer.cpp
+++ b/paddle/pserver/ProtoServer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h
index cf08e24ff3ef47d9c17bfe14d7d3aff1537b8ce8..3acdcc27dab532f964dc97636be020138180e780 100644
--- a/paddle/pserver/ProtoServer.h
+++ b/paddle/pserver/ProtoServer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -100,7 +100,8 @@ protected:
                              ResponseCallback callback);
 
   typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback)> ServiceFunction;
+                             ResponseCallback callback)>
+      ServiceFunction;
 
   /**
    * @brief register one RPC function in function mapping
diff --git a/paddle/pserver/RDMANetwork.h b/paddle/pserver/RDMANetwork.h
index 4e492a3afd120462ac6e056b9df850063c503a53..caef65134bf798851c0b826f7cc8bf2ab64b3f58 100644
--- a/paddle/pserver/RDMANetwork.h
+++ b/paddle/pserver/RDMANetwork.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
index 4ebc47d32659d82f32b9da529aec7ec3f46f77a9..05998891649cee30e23e556d9311c3a383f43e10 100644
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "SocketChannel.h"
 
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 #include <unistd.h>
 #include "RDMANetwork.h"
 
diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h
index 472b37a12283ca1c358034427d491804af765171..6c3dd20d7be60a53bfcf443d22888109ff9e3935 100644
--- a/paddle/pserver/SocketChannel.h
+++ b/paddle/pserver/SocketChannel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp
index 2085b22a95138fa8caf474a081fb46229688966f..6dd725db30cd6d50539d1b2b30ab9e42a081c7b3 100644
--- a/paddle/pserver/SparseParameterDistribution.cpp
+++ b/paddle/pserver/SparseParameterDistribution.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -20,26 +20,26 @@ limitations under the License. */
 
 #include "SparseParameterDistribution.h"
 
-P_DEFINE_bool(check_sparse_distribution_in_pserver,
-              false,
-              "check whether sparse parameter exhibts balanced distribution at "
-              "all pservers");
-P_DEFINE_bool(show_check_sparse_distribution_log,
-              false,
-              "show logs details for sparse parameter distribution in pserver");
-P_DEFINE_int32(check_sparse_distribution_batches,
-               100,
-               "run sparse parameter distribution check for N batches");
-P_DEFINE_double(
+DEFINE_bool(check_sparse_distribution_in_pserver,
+            false,
+            "check whether sparse parameter exhibts balanced distribution at "
+            "all pservers");
+DEFINE_bool(show_check_sparse_distribution_log,
+            false,
+            "show logs details for sparse parameter distribution in pserver");
+DEFINE_int32(check_sparse_distribution_batches,
+             100,
+             "run sparse parameter distribution check for N batches");
+DEFINE_double(
     check_sparse_distribution_ratio,
     0.6,
     "if parameters dispatched to different pservers exhibit unbalanced "
     " distribution for check_sparse_distribution_ratio * "
     " check_sparse_distribution_batches times, crash program");
-P_DEFINE_double(check_sparse_distribution_unbalance_degree,
-                2.0,
-                "the ratio of maximum data size and minimun data size for "
-                "different pserver");
+DEFINE_double(check_sparse_distribution_unbalance_degree,
+              2.0,
+              "the ratio of maximum data size and minimun data size for "
+              "different pserver");
 
 namespace paddle {
 
diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h
index af2b43af0ff58e842e53745dec7b501da6d36fe2..24b14106cf64060afa61ecede9e981301ea5634a 100644
--- a/paddle/pserver/SparseParameterDistribution.h
+++ b/paddle/pserver/SparseParameterDistribution.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <unistd.h>
 
-#include "paddle/utils/Logging.h"
 #include <atomic>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 24c90f10785a6f5870ab291a5c5e6c13fbc0d49f..066a6c02939695e7050a7693365d7c449f70e723 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 
 #include <thread>
 
@@ -195,9 +195,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
   channel_.reset(new SocketChannel(sockfd));
 }
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 10000000, "Data size");
-P_DEFINE_int32(loop_time, 100000, "test loop time");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 10000000, "Data size");
+DEFINE_int32(loop_time, 100000, "test loop time");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index eb813e92d6d696db6c2ced543a00594b69c7f5af..8e7231a9e1aee7b61f8dfa42f1367b79fee81a2b 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/pserver/ParameterClient2.h>
 #include <paddle/pserver/ParameterServer2.h>
-#include <gtest/gtest.h>
 #include <paddle/utils/Flags.h>
 #include <paddle/utils/Util.h>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(num_gradient_servers);
-P_DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-P_DEFINE_int32(server_cpu, 0, "assign server cpu");
+DECLARE_int32(num_gradient_servers);
+DEFINE_string(server_addr, "127.0.0.1", "assign server address");
+DEFINE_int32(server_cpu, 0, "assign server cpu");
 
 class ParameterServer2Tester : public ParameterServer2 {
 public:
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 79d1f2743a1c2e6050afe48d6cf86a1084a4500c..9f86ee80f4e5cc99ea3597b3ed37a387578f032a 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,15 +16,15 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include "paddle/utils/Stat.h"
+#include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
-#include "ParameterService.pb.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 50000000, "Data size");
-P_DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-P_DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 50000000, "Data size");
+DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
+DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/test/test_ProtoServer.sh b/paddle/pserver/test/test_ProtoServer.sh
index a87b1b1ddcd0ea7f48a6a37575dd17fd53fd0751..970c90b494c2a256cf22f3de7b7ea7964fed58ab 100755
--- a/paddle/pserver/test/test_ProtoServer.sh
+++ b/paddle/pserver/test/test_ProtoServer.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/py_paddle/__init__.py b/paddle/py_paddle/__init__.py
index f8399f9c63d81f5a52bf2b277789c26d809f0153..5504d1d50c523315036bfaaf6641c5216269a5e5 100644
--- a/paddle/py_paddle/__init__.py
+++ b/paddle/py_paddle/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index d64c7b20cb65a4b8dfebfc516cfc2c3fdc247114..edcefba6a854df518fd2eb8c1fea5b72c5f5d6a8 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index e1f310580f95cfb210ba89589bab668433818b23..ce105d249aaf3e838443d3e0cf5996fe8c783a22 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/scripts/cluster_train/conf.py b/paddle/scripts/cluster_train/conf.py
index f1114a59201b9e57a14b739a327b622327c515f7..c77d7584d3c89144761875b0fbc70369e355930a 100644
--- a/paddle/scripts/cluster_train/conf.py
+++ b/paddle/scripts/cluster_train/conf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py
index 7343a600c1bf5522ac8b0cd90a38f8a362ba7ae6..9b03ed1d8f6a28259a6cb45f096575b5f3d27ca7 100644
--- a/paddle/scripts/cluster_train/paddle.py
+++ b/paddle/scripts/cluster_train/paddle.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..207f97c4a69e6681702d3fe73475885d9b867ce9
--- /dev/null
+++ b/paddle/scripts/docker/Dockerfile
@@ -0,0 +1,56 @@
+FROM ubuntu:14.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update \
+    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    libatlas-dev libatlas3-base g++ m4 python-pip \
+    python-protobuf python-numpy python-dev swig openssh-server \
+    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
+    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    && apt-get clean -y
+RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
+RUN pip install -U BeautifulSoup docopt PyYAML pillow \
+    sphinx sphinx_rtd_theme recommonmark
+
+# cmake tends to hide and blur the dependencies between code modules, as
+# noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
+# thinking about using Bazel to fix this problem, e.g.,
+# https://github.com/PaddlePaddle/Paddle/issues/681#issuecomment-263996102. To
+# start the trail of fixing, we add Bazel to our Dockerfiles.
+RUN apt-get update && apt-get install -y curl software-properties-common \
+    && add-apt-repository ppa:webupd8team/java \
+    && echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections \
+    && echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list \
+    && curl https://bazel.build/bazel-release.pub.gpg | apt-key add - \
+    && apt-get update && apt-get install -y oracle-java8-installer bazel
+
+ARG WITH_AVX
+ARG WITH_DOC
+ARG WITH_SWIG_PY
+ARG WITH_STYLE_CHECK
+
+ENV WITH_GPU=OFF
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV WITH_DOC=${WITH_DOC:-ON}
+ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+
+RUN mkdir /paddle
+COPY . /paddle/
+RUN /paddle/paddle/scripts/docker/build.sh
+VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
+
+RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
+RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
+RUN paddle version  # print version after build
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/docker/Dockerfile.cpu b/paddle/scripts/docker/Dockerfile.cpu
deleted file mode 100644
index 69b8363b7ac9eed033ec4958e189e233b3dc2689..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=OFF
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.cpu-demo b/paddle/scripts/docker/Dockerfile.cpu-demo
deleted file mode 100644
index ccbd183ee3c1ac27fc624f22847f53eb7d60b83d..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu-demo
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=ON
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.cpu-devel b/paddle/scripts/docker/Dockerfile.cpu-devel
deleted file mode 100644
index 36460384f383ba10c4bff1d9875cd053d6391b97..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu-devel
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx b/paddle/scripts/docker/Dockerfile.cpu-noavx
deleted file mode 100644
index fa3b7427b0ad3973423894fa7af54ae5a2514e06..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu-noavx
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=OFF
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-demo b/paddle/scripts/docker/Dockerfile.cpu-noavx-demo
deleted file mode 100644
index 61315f762dee4d64251ef3d8db5b11b30a3ddb3a..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu-noavx-demo
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=ON
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-devel b/paddle/scripts/docker/Dockerfile.cpu-noavx-devel
deleted file mode 100644
index 76365311990b527ea473be840770bfeb6025d74f..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.cpu-noavx-devel
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=OFF
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 1e023ae2818dbb27c457ff17b01fc4ab02815eba..33f6adfea2a602c53beb4685e0bf3f87452e2d53 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -1,12 +1,56 @@
 FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update \
+    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    libatlas-dev libatlas3-base g++ m4 python-pip \
+    python-protobuf python-numpy python-dev swig openssh-server \
+    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
+    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    && apt-get clean -y
+RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
+RUN pip install -U BeautifulSoup docopt PyYAML pillow \
+    sphinx sphinx_rtd_theme recommonmark
+
+# cmake tends to hide and blur the dependencies between code modules, as
+# noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
+# thinking about using Bazel to fix this problem, e.g.,
+# https://github.com/PaddlePaddle/Paddle/issues/681#issuecomment-263996102. To
+# start the trail of fixing, we add Bazel to our Dockerfiles.
+RUN apt-get update && apt-get install -y curl software-properties-common \
+    && add-apt-repository ppa:webupd8team/java \
+    && echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections \
+    && echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list \
+    && curl https://bazel.build/bazel-release.pub.gpg | apt-key add - \
+    && apt-get update && apt-get install -y oracle-java8-installer bazel
+
+ARG WITH_AVX
+ARG WITH_DOC
+ARG WITH_SWIG_PY
+ARG WITH_STYLE_CHECK
+
 ENV WITH_GPU=ON
-ENV IS_DEVEL=OFF
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV WITH_DOC=${WITH_DOC:-ON}
+ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+
+RUN mkdir /paddle
+COPY . /paddle/
+RUN /paddle/paddle/scripts/docker/build.sh
+VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
+
+RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
+RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
+RUN paddle version  # print version after build
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu-demo b/paddle/scripts/docker/Dockerfile.gpu-demo
deleted file mode 100644
index 92b0dca4026c89c6749e14f189370183462333b8..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu-demo
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=ON
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=ON
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.gpu-devel b/paddle/scripts/docker/Dockerfile.gpu-devel
deleted file mode 100644
index fb6f351fd2f7e0f950e00ac96681de88ca238f70..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu-devel
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=ON
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=ON
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx b/paddle/scripts/docker/Dockerfile.gpu-noavx
deleted file mode 100644
index 7567e62025506ca2ae8c1d35d595d92ed6de87f3..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu-noavx
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=ON
-ENV IS_DEVEL=OFF
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-demo b/paddle/scripts/docker/Dockerfile.gpu-noavx-demo
deleted file mode 100644
index ac52484c5cb513537283e1a0ffbe9df067fefc9a..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu-noavx-demo
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=ON
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=ON
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-devel b/paddle/scripts/docker/Dockerfile.gpu-noavx-devel
deleted file mode 100644
index 19202f306b8f71e93af085d5285098a1fbe1dba7..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu-noavx-devel
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=ON
-ENV IS_DEVEL=ON
-ENV WITH_DEMO=OFF
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=OFF
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/Dockerfile.m4 b/paddle/scripts/docker/Dockerfile.m4
deleted file mode 100644
index e14493ed9e842351125ab458db53fcc3f38233f6..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.m4
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM PADDLE_BASE_IMAGE
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
-ENV WITH_GPU=PADDLE_WITH_GPU
-ENV IS_DEVEL=PADDLE_IS_DEVEL
-ENV WITH_DEMO=PADDLE_WITH_DEMO
-ENV PIP_INSTALL_ARGS ""
-ENV PIP_GENERAL_ARGS ""
-ENV USE_UBUNTU_MIRROR OFF
-ENV WITH_AVX=PADDLE_WITH_AVX
-RUN cd /root/ && bash build.sh
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
old mode 100644
new mode 100755
index ec5f3bd967d3569ee058a2e12d85fc50ba25c69d..ca3f1c3f1896feaae657f47c121ce6cd858dc2c9
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -7,43 +7,41 @@ function abort(){
 
 trap 'abort' 0
 set -e
-if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then
-    sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\
-      /etc/apt/sources.list
-fi
-apt-get update
-apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\
-    python-protobuf python-numpy python-dev swig
 
 if [ ${WITH_GPU} == 'ON' ]; then
   ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
 fi
 
-cd ~
-git clone https://github.com/baidu/Paddle.git paddle
-cd paddle
-git checkout ${GIT_CHECKOUT}
-mkdir build
-cd build
-cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\
-   -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -DWITH_AVX=${WITH_AVX}
+mkdir -p /paddle/build # -p means no error if exists
+cd /paddle/build
+cmake .. \
+      -DWITH_DOC=ON \
+      -DWITH_GPU=${WITH_GPU} \
+      -DWITH_AVX=${WITH_AVX} \
+      -DWITH_SWIG_PY=ON \
+      -DCUDNN_ROOT=/usr/ \
+      -DWITH_STYLE_CHECK=OFF \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 make -j `nproc`
-# because durning make install, there are several warning, so set +e, do not cause abort
 make install
-echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl
-paddle version  # print version after build
 
-if [ ${WITH_DEMO} == "ON" ]; then
-  apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\
-	          sed grep graphviz libjpeg-dev zlib1g-dev
-  pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS}  BeautifulSoup docopt \
-    PyYAML pillow
-fi
-if [ ${IS_DEVEL} == "OFF" ]; then  # clean build packages.
-  cd ~
-  rm -rf paddle
-fi
-apt-get clean -y
+# Install woboq_codebrowser.
+git clone https://github.com/woboq/woboq_codebrowser /woboq
+cd /woboq
+cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+      -DCMAKE_BUILD_TYPE=Release \
+      .
+make
+
+export WOBOQ_OUT=/usr/share/nginx/html/paddle
+export BUILD_DIR=/paddle/build
+mkdir -p $WOBOQ_OUT
+cp -rv /woboq/data $WOBOQ_OUT/../data
+/woboq/generator/codebrowser_generator \
+    -b /paddle/build \
+    -a \
+    -o $WOBOQ_OUT \
+    -p paddle:/paddle
+/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+
 trap : 0
diff --git a/paddle/scripts/docker/generate.sh b/paddle/scripts/docker/generate.sh
deleted file mode 100644
index 2ad7527db127f3bd2018a7a1f5b40dacfecca6da..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/generate.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-set -e
-cd `dirname $0`
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
-   Dockerfile.m4 > Dockerfile.cpu
-
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
-   Dockerfile.m4 > Dockerfile.cpu-noavx
-
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
-   Dockerfile.m4 > Dockerfile.cpu-noavx-devel
-
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
-   Dockerfile.m4 > Dockerfile.cpu-devel
-
-
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
-   Dockerfile.m4 > Dockerfile.cpu-demo
-
-m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-   -DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
-   Dockerfile.m4 > Dockerfile.cpu-noavx-demo
-
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=ON \
-   Dockerfile.m4 > Dockerfile.gpu
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=OFF \
-   Dockerfile.m4 > Dockerfile.gpu-noavx
-
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=ON \
-   Dockerfile.m4 > Dockerfile.gpu-devel
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=OFF \
-   Dockerfile.m4 > Dockerfile.gpu-noavx-devel
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=ON \
-   Dockerfile.m4 > Dockerfile.gpu-demo
-
-
-m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-   -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-   -DPADDLE_WITH_AVX=OFF \
-   Dockerfile.m4 > Dockerfile.gpu-noavx-demo
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 20ea2fedc4d464cdd5403af28bc917770c993b98..283fd34a6d8a2268f3800ec69920e128ac75e7dc 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,14 +21,13 @@ function version(){
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_glog: @WITH_GLOG@"
-        echo "    with_gflags: @WITH_GFLAGS@"
         echo "    with_metric_learning: @WITH_METRIC@"
         echo "    with_timer: @WITH_TIMER@"
         echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
 }
 
 function ver2num() {
+  set -e
   # convert version to number.
   if [ -z "$1" ]; then # empty argument
     printf "%03d%03d%03d%03d%03d" 0
@@ -41,6 +40,7 @@ function ver2num() {
       printf "%03d%03d%03d%03d%03d" $VERN
     fi
   fi
+  set +e
 }
 
 PADDLE_CONF_HOME="$HOME/.config/paddle"
diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile
index 5db0b29c4739943f9e677dc7973b392a345b7da1..78dc756bd1175019d90fc852635497fea1eb55e2 100644
--- a/paddle/scripts/tools/build_docs/Dockerfile
+++ b/paddle/scripts/tools/build_docs/Dockerfile
@@ -1,6 +1,7 @@
 FROM paddledev/paddle:cpu-devel-latest
 COPY build.sh /
 RUN pip install sphinx &&\
+    pip install sphinx_rtd_theme &&\
     apt install -y doxygen graphviz &&\
-    pip install breathe recommonmark numpy protobuf==2.6.1
+    pip install recommonmark numpy protobuf==2.6.1
 CMD /build.sh
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
index ec2ac1f2240765d1c453ce50ec44286a551a37ba..9620bff6bcf77c6e87f149e8e33408170dd8e507 100755
--- a/paddle/scripts/travis/before_install.linux.sh
+++ b/paddle/scripts/travis/before_install.linux.sh
@@ -1,5 +1,16 @@
 #!/bin/bash
 set -e
+pip install protobuf
+cd /tmp
+wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
+tar xf protobuf.tar.gz
+cd protobuf*
+./autogen.sh
+./configure --prefix=/usr/
+make -j 2 install
+cd ..
+rm -rf protobuf*
+
 pushd /usr/src/gtest
 cmake .
 make
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index f438e69b822aa45448e1d303bf27af482a3d88d8..bd88ed39132f19ca7cfc4f0dd6acdbc6b83e94ab 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -2,9 +2,8 @@
 brew update
 brew tap homebrew/science
 brew install python
-sudo pip install --upgrade protobuf==2.6.0
-brew install homebrew/versions/protobuf260 --without-python
-brew install cmake python glog gflags openblas wget md5sha1sum
+sudo pip install --upgrade protobuf
+brew install cmake python glog gflags openblas wget md5sha1sum protobuf
 
 wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
 tar xf gtest.tar.gz
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 242fd982aa0015bfe9cb910c52afc3b42ab1028b..9caeb21beb15ee5281f9a6aefcfd59b94b91e48a 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+./build_submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d458bf92bf455609de601c60402101d09765dfe4
--- /dev/null
+++ b/paddle/scripts/travis/build_submodules.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+WORK_DIR=$PWD
+PROJ_ROOT=$(git rev-parse --show-cdup)
+SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
+
+for module in $SUBMODULES
+do
+  case $module in
+    "warp-ctc")
+      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
+        rm -rf ${PROJ_ROOT}warp-ctc/build
+      fi
+      mkdir ${PROJ_ROOT}warp-ctc/build
+      cd ${PROJ_ROOT}warp-ctc/build
+      cmake ..; make
+    ;;
+  esac
+done
+cd $WORK_DIR
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index c2a4809d75b97a9d8d8b83cf197e90bd62b48603..0bbb76a8a3caa27da0911af0fe87df7fbff617b4 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -47,17 +47,20 @@ if [ $? -eq 0 ]; then
 fi
 set -e
 
-# Commit
-git add .
-git config user.name "Travis CI"
-git config user.email "paddle-dev@baidu.com"
-git commit -m "Deploy to GitHub Pages: ${SHA}"
-
-# Set ssh private key
-openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
-chmod 600 deploy_key
-eval `ssh-agent -s`
-ssh-add deploy_key
-
-# Push
-git push $SSH_REPO $TARGET_BRANCH
+if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/Paddle.
+  # Commit
+  git add .
+  git config user.name "Travis CI"
+  git config user.email "paddle-dev@baidu.com"
+  git commit -m "Deploy to GitHub Pages: ${SHA}"
+
+  # Set ssh private key
+  openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
+  chmod 600 deploy_key
+  eval `ssh-agent -s`
+  ssh-add deploy_key
+
+  # Push
+  git push $SSH_REPO $TARGET_BRANCH
+
+fi
diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh
index c49d4546c24ac9304cd6f3c5940ed3d1d32ebb3d..13f2552d29db38041a73edca0acd202945c67484 100755
--- a/paddle/scripts/travis/main.sh
+++ b/paddle/scripts/travis/main.sh
@@ -5,6 +5,8 @@ if [ ${JOB} == "BUILD_AND_TEST" ]; then
   ./build_and_test.sh
 elif [ ${JOB} == "DOCS" ]; then
   ./docs.sh
+elif [ ${JOB} == "PRE_COMMIT" ]; then
+  ./precommit.sh
 else
   echo Unknown job ${JOB}
   exit 1
diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/precommit.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7a59b1131d0a410be9c5cef08e3cc11633d2ba67
--- /dev/null
+++ b/paddle/scripts/travis/precommit.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+function abort(){
+    echo "Your commit not fit PaddlePaddle code style" 1>&2
+    echo "Please use pre-commit scripts to auto-format your code" 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+source common.sh
+cd ..
+export PATH=/usr/bin:$PATH
+pre-commit install
+clang-format --version
+
+if ! pre-commit run -a ; then
+  git diff  --exit-code
+fi
+
+trap : 0
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 1a15eafd5528a68aa9a68ed020de6decb61bd2a7..b4c38a41b86683f89b6d02e9db97b75e9dca89ea 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 1d15c66d4d09d047f4ede83b4ad23733d0175617..91d89b61a32259b8bbe70fda2579f87ec6b9af00 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/pserver/ParameterServer2.h"
 #include "ParamUtil.h"
 #include "Trainer.h"
+#include "paddle/pserver/ParameterServer2.h"
+#include "paddle/utils/PythonUtil.h"
 
-P_DEFINE_string(model_dir, "", "Directory for separated model files");
-P_DEFINE_string(model_file, "", "File for merged model file");
+DEFINE_string(model_dir, "", "Directory for separated model files");
+DEFINE_string(model_file, "", "File for merged model file");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp
index 2be9cd62235a262812231579c536a5f0596b69d9..ffbca42e106591ddeb2cefcfafbeb408c544371b 100644
--- a/paddle/trainer/ParamUtil.cpp
+++ b/paddle/trainer/ParamUtil.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 #include <paddle/utils/Version.h>
 
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
 
+#include "TesterConfig.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
-#include "TesterConfig.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
index 3923941c3d1533621d89313aa09801e98cd5b8a9..2e05595848760c9abd7d916003656c8103151abf 100644
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,11 +22,11 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 
+#include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
 #include "TrainerConfig.pb.h"
 #include "TrainerConfigHelper.h"
-#include "ParameterUpdater.h"
-#include <fstream>
-#include <stdlib.h>
 
 namespace paddle {
 
diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp
index 6001a0b391fb3425315de3194945a4d04aff7150..8b5b95da5b397aada6972d46db35fee8ca1a2ea4 100644
--- a/paddle/trainer/ParameterUpdater.cpp
+++ b/paddle/trainer/ParameterUpdater.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index b83b4cf55e27b25864499531bbfe483fb75f78a1..e52b5cd318b4d647a4bd126adf2ecfaba08d8363 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -24,8 +24,8 @@ limitations under the License. */
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdaterBase.h"
 
-#include "paddle/gserver/layers/Layer.h"
 #include "TrainerConfig.pb.h"
+#include "paddle/gserver/layers/Layer.h"
 
 #include <memory>
 #include <vector>
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
index d83bb5b10adeff2dc43ad4705e5c55d10856de0d..974e78fa17d6564414962475f81497491bbb0482 100644
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "RemoteParameterUpdater.h"
 #include "Trainer.h"
-#include "paddle/utils/Stat.h"
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/Stat.h"
 
-P_DECLARE_int32(trainer_id);
-P_DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
 
 namespace paddle {
 
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index a40884724cc7f963dc6ce5eede750327b2bbfed9..66055c778e439a1edf7d1b6dd2e13b945fa73323 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include <thread>
 #include <functional>
-#include "paddle/pserver/ParameterClient2.h"
+#include <thread>
 #include "ParameterUpdater.h"
-#include "paddle/utils/Util.h"
+#include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 30e92682baec2fc6035ecfa9dbd90415acd5abe1..24fac3e5a8141cbec912d276833ec491385b97ab 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
 
+#include "TesterConfig.h"
+#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "TesterConfig.h"
 
 namespace paddle {
 
@@ -46,6 +46,12 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
       gradientMachine_(gradientMachine),
       parameterUpdater_(parameterUpdater),
       testDataProvider_(testDataProvider) {
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
+               << "when doing train and test jobs in the same "
+               << "process. You could run paddle --job=test in "
+               << "a separate process.";
+  }
   testEvaluator_.reset(gradientMachine_->makeEvaluator());
   if (intconfig_->distributeTest) {
     testParameterClient_.reset(new ParameterClient2(true));
@@ -66,6 +72,9 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
 }
 
 void Tester::startTestPeriod() {
+  if (testDataProvider_) {
+    testDataProvider_->reset();
+  }
   testEvaluator_->start();
   testContext_.cost = 0;
   testContext_.numSamples = 0;
@@ -87,33 +96,18 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch,
 void Tester::testOnePeriod() {
   DataBatch dataBatch;
   int64_t batchSize = config_->getOptConfig().batch_size();
-  bool testAllData =
-      intconfig_->testPeriod == 0 || intconfig_->testAllDataInOnePeriod;
-  int batches =
-      testAllData ? std::numeric_limits<int>::max() : intconfig_->testPeriod;
-
   std::vector<Argument> outArgs;
-
   startTestPeriod();
-  for (int i = 0; i < batches; ++i) {
-    int num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
-    if (num == 0) {
-      testDataProvider_->reset();
-      if (intconfig_->prevBatchState) {
-        gradientMachine_->resetState();
-      }
-      if (testAllData) {
-        break;
-      } else {
-        num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
-      }
-    }
+  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
     testOneDataBatch(dataBatch, &outArgs);
   }
   finishTestPeriod();
 }
 
 void Tester::finishTestPeriod() {
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->resetState();
+  }
   testEvaluator_->finish();
   CHECK_GT(testContext_.numSamples, 0)
       << "There is no samples in your test batch. Possibly "
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index a9de9fe208c61c00fbeebe644222e255308e762b..e892744db278586f2fd5b3cb527aa7c17752c477 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -24,12 +24,12 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
+#include <stdlib.h>
+#include <fstream>
 #include "ParamUtil.h"
+#include "ParameterUpdater.h"
 #include "TesterConfig.h"
 #include "TrainerInternalConfig.h"
-#include <fstream>
-#include <stdlib.h>
 
 namespace paddle {
 
diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h
index 90267e68d768f2a144e0041d0f493072ef9eb9a1..68d4c931ff2df8e24acaa9fe6b35bfd613197c72 100644
--- a/paddle/trainer/TesterConfig.h
+++ b/paddle/trainer/TesterConfig.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,9 +23,9 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
-#include <fstream>
 #include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
 
 namespace paddle {
 
@@ -39,11 +39,6 @@ struct TesterConfig {
    */
   int testPeriod;
 
-  /**
-   * indicate whether testing data in one period
-   */
-  bool testAllDataInOnePeriod;
-
   /**
    * indicate whether to save previous batch state
    */
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index cc22851d8ecbf594df1e3f2c8aeaa98c07b3765b..9caa92a4d7557c0c8633d881820862bbbd5df87e 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Thread.h"
 
-P_DECLARE_int32(trainer_count);
+DECLARE_int32(trainer_count);
 
 namespace paddle {
 
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index 5a5e3f1d4b3c1e915aa6ac01ff503c552e42de1a..d01ac689f97f360b64d4e63032a804f1f24c83e2 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Util.h"
 #include "paddle/parameter/AverageOptimizer.h"
 #include "paddle/parameter/FirstOrderOptimizer.h"
 #include "paddle/parameter/OptimizerFunctions.h"
 #include "paddle/parameter/OptimizerWithRegularizer.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/Regularizer.h"
+#include "paddle/utils/Util.h"
 
 #include <memory>
 #include <vector>
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 8a5162912e5feae9b80ab8fff56bb20e4dac1696..1eec2c432d235ef484b688db08aae8a39f878a85 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,83 +17,77 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/Excepts.h"
-#include "paddle/utils/GlobalConstants.h"
 
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
+#include "RemoteParameterUpdater.h"
 #include "TesterConfig.h"
 #include "ThreadParameterUpdater.h"
-#include "RemoteParameterUpdater.h"
 #include "TrainerConfigHelper.h"
+#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/gserver/layers/ValidationLayer.h"
 
-P_DEFINE_string(config, "", "Trainer config file");
-P_DEFINE_int32(test_period,
-               0,
-               "Run test every so many train batches."
-               " 0 for testing after each pass."
-               " If not 0, test log_period batches."
-               " If 0, test on all test data");
-
-P_DEFINE_bool(local, true, "Train in local mode or not");
-
-P_DEFINE_bool(
-    test_all_data_in_one_period,
-    false,
-    "true will test all data in one test peroid."
-    "Otherwise test (batch_size * log_peroid) data in one test period.");
-
-P_DEFINE_int32(average_test_period,
-               0,
-               "Do test on average parameter every so"
-               " many batches. MUST be devided by FLAGS_log_period."
-               " Default 0 means do not test average parameter");
-
-P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-P_DEFINE_int64(saving_period_by_batches,
-               0,
-               "Save parameters every so many batches in one pass");
-P_DEFINE_string(save_dir, "", "Directory for saving model parameter");
-P_DEFINE_int32(start_pass,
-               0,
-               "Start training from this pass. "
-               "Will load parameter from the previous pass");
-P_DEFINE_int32(test_pass,
-               -1,
-               "Will load parameter start from this pass to test");
-P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-P_DEFINE_bool(with_cost, true, "enable cost layer or not");
-P_DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-P_DEFINE_int32(num_passes, 100, "train for so many passes");
-
-P_DEFINE_string(config_args,
-                "",
-                "arguments passed to config file."
-                "Format: key1=value1,key2=value2");
-
-P_DEFINE_bool(save_only_one,
-              false,
-              "Save only parameters in last pass, remove previous.");
-
-P_DEFINE_string(feat_file, "", "File name of extracted feature.");
-P_DEFINE_string(predict_output_dir,
-                "",
-                "Directory that saves the predicted results of output layers");
-P_DEFINE_string(model_list,
-                "",
-                "File that saves the model list when evaluation");
+DEFINE_string(config, "", "Trainer config file");
+
+DEFINE_int32(test_period,
+             0,
+             "if equal 0, do test on all test data at the end of "
+             "each pass. While if equal non-zero, do test on all test "
+             "data every test_period batches");
+DEFINE_bool(test_all_data_in_one_period,
+            false,
+            "This option was deprecated, since we will always do "
+            "test on all test set ");
+
+DEFINE_bool(local, true, "Train in local mode or not");
+
+DEFINE_int32(average_test_period,
+             0,
+             "Do test on average parameter every so"
+             " many batches. MUST be devided by FLAGS_log_period."
+             " Default 0 means do not test average parameter");
+
+DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
+DEFINE_int64(saving_period_by_batches,
+             0,
+             "Save parameters every so many batches in one pass");
+DEFINE_string(save_dir, "", "Directory for saving model parameter");
+DEFINE_int32(start_pass,
+             0,
+             "Start training from this pass. "
+             "Will load parameter from the previous pass");
+DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
+DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
+DEFINE_bool(with_cost, true, "enable cost layer or not");
+DEFINE_bool(distribute_test, false, "test in distribute mode");
+
+DEFINE_int32(num_passes, 100, "train for so many passes");
+
+DEFINE_string(config_args,
+              "",
+              "arguments passed to config file."
+              "Format: key1=value1,key2=value2");
+
+DEFINE_bool(save_only_one,
+            false,
+            "Save only parameters in last pass, remove previous.");
+
+DEFINE_string(feat_file, "", "File name of extracted feature.");
+DEFINE_string(predict_output_dir,
+              "",
+              "Directory that saves the predicted results of output layers");
+DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
@@ -205,7 +199,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
       (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
 
   dataProvider_ = dataProvider;
-  if (!dataProvider_ && config_->hasDataConfig()) {
+  if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
     dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
   }
   if (!testDataProvider_) {
@@ -396,10 +390,6 @@ void Trainer::startTrain() {
     dataProvider_->reset();
   }
 
-  if (this->testDataProvider_) {
-    this->testDataProvider_->reset();
-  }
-
   trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
 }
 
@@ -633,8 +623,17 @@ void Trainer::test() { tester_->test(); }
 
 std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
   TesterConfig* conf = new TesterConfig;
+  if (FLAGS_test_period) {
+    LOG(WARNING) << "The meaning of --test_period is changed: "
+                 << "if equal 0, do test on all test data at the end of "
+                 << "each pass. While if equal non-zero, do test on all test "
+                 << "data every test_period batches ";
+  }
+  if (FLAGS_test_all_data_in_one_period) {
+    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
+                 << "we will always do test on all test set ";
+  }
   conf->testPeriod = FLAGS_test_period;
-  conf->testAllDataInOnePeriod = FLAGS_test_all_data_in_one_period;
   conf->prevBatchState = FLAGS_prev_batch_state;
   conf->logPeriod = FLAGS_log_period;
   conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index 899607c7c0f17ef2e91969f5ba1dcfa573518727..7cbf18ace7a5fed053653c73e62d36c388b15123 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,19 +22,19 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 
-#include "TrainerConfigHelper.h"
+#include <stdlib.h>
+#include <fstream>
+#include "ParamUtil.h"
 #include "ParameterUpdater.h"
-#include "TrainerInternal.h"
 #include "Tester.h"
-#include "ParamUtil.h"
-#include <fstream>
-#include <stdlib.h>
+#include "TrainerConfigHelper.h"
+#include "TrainerInternal.h"
 
 #ifdef PADDLE_METRIC_LEARNING
 #include "paddle/internals/metric_learning/MetricTrainer.h"
 #endif
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp
index 54862e95b4a738b88dc256efbac9102fca383a4f..173653c81688fe4606731c68ea1854268b3f4590 100644
--- a/paddle/trainer/TrainerBenchmark.cpp
+++ b/paddle/trainer/TrainerBenchmark.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
-P_DECLARE_int32(test_period);
+DECLARE_int32(test_period);
 
-P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
+DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index ee5b1e0a9c5a8faa6614d76ab938f1f1b8f4e73a..60ac8459a12db801321da4a9d9c1d48ac8bd6d16 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,16 +18,16 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(save_dir);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_bool(local);
-P_DECLARE_bool(with_cost);
-P_DECLARE_bool(with_gpu);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_bool(local);
+DECLARE_bool(with_cost);
+DECLARE_bool(with_gpu);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
index d20684964136a553b2d4119e8db5a1de084278bb..f1366cc041b0d983e65a1bf5b02ec2128324c5a8 100644
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
 #include <paddle/utils/Logging.h>
 #include <paddle/utils/Util.h>
+#include <memory>
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index b1c3bf26d21d1760cd1710f372aa8a89fb7b101b..f3b465b444167d4624a5e99c30e1257eda53ca2c 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/gserver/layers/ValidationLayer.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
 
-#include "ThreadParameterUpdater.h"
 #include "RemoteParameterUpdater.h"
+#include "ThreadParameterUpdater.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
index 962d53a30e5454060e8ce864c347c37b9cc98116..7018faab24744f7a087a53130acc56ec6314101e 100644
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,15 +17,15 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include <stdio.h>
-#include <fstream>
 #include <stdlib.h>
+#include <fstream>
 
-#include "hl_gpu.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "TrainerConfig.pb.h"
 #include "ParameterUpdater.h"
+#include "TrainerConfig.pb.h"
 #include "TrainerConfigHelper.h"
 #include "TrainerInternalConfig.h"
+#include "hl_gpu.h"
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp
index 0dc74cb3b39309b33a1a92dfa5a45e95defb4120..039fcdb524527d5e8bfa829fc403b6f2fa789991 100644
--- a/paddle/trainer/TrainerInternalConfig.cpp
+++ b/paddle/trainer/TrainerInternalConfig.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #include "TrainerInternalConfig.h"
 
-P_DEFINE_int32(show_parameter_stats_period,
-               0,
-               "Whether to show parameter stats during training");
+DEFINE_int32(show_parameter_stats_period,
+             0,
+             "Whether to show parameter stats during training");
 
-P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
+DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
 
-P_DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
+DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
-P_DECLARE_bool(local);
+DECLARE_bool(local);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h
index b7bfd29abd729b33ca953fb20835c57cbcf3ef74..b47692720efc2ed4f2db84f61ca81fcb52d234c0 100644
--- a/paddle/trainer/TrainerInternalConfig.h
+++ b/paddle/trainer/TrainerInternalConfig.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
+#include <stdlib.h>
 #include <fstream>
 #include <sstream>
-#include <stdlib.h>
+#include "ParameterUpdater.h"
 
 namespace paddle {
 /**
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index e23e745d99c7b10fb780cb0c89e27207eefc19c1..947f9cadcc983d58ce31ef462e51dc42e41eaf1b 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,30 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fenv.h>
+#include "paddle/pserver/ParameterServer2.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Excepts.h"
-#include "paddle/pserver/ParameterServer2.h"
 
 #include "ParamUtil.h"
 #include "Trainer.h"
 #include "paddle/pserver/RDMANetwork.h"
 
-P_DEFINE_bool(start_pserver, false, "Whether to start pserver");
-P_DECLARE_int32(gpu_id);
-P_DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(rdma_tcp);
+DEFINE_bool(start_pserver, false, "Whether to start pserver");
+DECLARE_int32(gpu_id);
+DEFINE_string(job, "train", "one of (train, test, checkgrad)");
+DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(rdma_tcp);
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
-// write logs instantly (never buffer log messages)
-#ifdef PADDLE_USE_GLOG
+  // write logs instantly (never buffer log messages)
   FLAGS_logbuflevel = -1;
-#endif
+
   initMain(argc, argv);
   initPython(argc, argv);
 
diff --git a/paddle/trainer/tests/__init__.py b/paddle/trainer/tests/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/paddle/trainer/tests/__init__.py
+++ b/paddle/trainer/tests/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/chunking.conf b/paddle/trainer/tests/chunking.conf
index 01c15fab5f7c0f51b8d4ca3296699c7b11e14a6e..d88df919df8fee9209336ffa29d724dabe6af31b 100644
--- a/paddle/trainer/tests/chunking.conf
+++ b/paddle/trainer/tests/chunking.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/config_parser_test.py b/paddle/trainer/tests/config_parser_test.py
index c5ec315d6b01b0a5a3f73673e1756e9c06d685ba..db66ebb5b7c13fe53df14a07918aad62ba895ffa 100644
--- a/paddle/trainer/tests/config_parser_test.py
+++ b/paddle/trainer/tests/config_parser_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/gen_proto_data.py b/paddle/trainer/tests/gen_proto_data.py
index a3dbc10c886e183582b44fee479d5ffb074193ef..8cc6d44673b9f992c28ae95cc06db5ea5aca0642 100644
--- a/paddle/trainer/tests/gen_proto_data.py
+++ b/paddle/trainer/tests/gen_proto_data.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index cb657d219e55c1e349ffb77a88945085b4149c78..23bfa164080a6ea392bb6ee15e7e2bec25257ce9 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -30,10 +30,10 @@
 #define picojson_h
 
 #include <algorithm>
+#include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <cstddef>
 #include <iostream>
 #include <iterator>
 #include <limits>
diff --git a/paddle/trainer/tests/sample_trainer_config.conf b/paddle/trainer/tests/sample_trainer_config.conf
index 15901065b226fad0e8a0b7ee92193c3157db3498..2697832840f35a33c07f1664ef18a229d656d784 100644
--- a/paddle/trainer/tests/sample_trainer_config.conf
+++ b/paddle/trainer/tests/sample_trainer_config.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf
index 174cb5e25f1c4d400869852599cb6b80ecbda033..e4abe31d480b69bc2ff4741649b336714818515b 100644
--- a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf
+++ b/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
index f5b1988ddaf5fba486d35b90cf4c267d209ce290..b1744db8d604c88ec47e7104f79b38bb9d0e4442 100644
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
index f5b1988ddaf5fba486d35b90cf4c267d209ce290..b1744db8d604c88ec47e7104f79b38bb9d0e4442 100644
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_parallel.conf b/paddle/trainer/tests/sample_trainer_config_parallel.conf
index e35a1f26dad2f81f70fe31f9f8c921606ea8461b..e2b8b3ecdab83b4614dbe468c3a295c05867f7f9 100644
--- a/paddle/trainer/tests/sample_trainer_config_parallel.conf
+++ b/paddle/trainer/tests/sample_trainer_config_parallel.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
index d254cc5700abfba652b6e969201272ad7d422b7a..d19222360c2f424ddb306b155dfef07921098a6b 100644
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf
index cbb666302943021e31a65715c619202aafd8ce70..b720d4d5a6ca59e207832a8c5410c2cb6074c439 100644
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ b/paddle/trainer/tests/sample_trainer_config_rnn.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
index 613fd325e10fb742ec18d1451e49da975fcfce18..d669fbc40cbc19df309d8bf20c942a9d8fc8f47d 100644
--- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
index ec1c12cc896fb7868ea906447378966821dd3e93..2b337282f6285afb527e9bbf138d2e8184700d8d 100644
--- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py
index 4607bec24e1fec6f8b9996eb32fe991dbbe3ed79..2c29a274339747b78fbd6c27ae4070f0abbd4028 100644
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index 03312f9e470e0f8b01e229237d25a7ac8e088c5c..72fc76bea35e433eeb08ba625b4bf6afdda491fb 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,18 +16,18 @@ limitations under the License. */
 
 #include "paddle/trainer/Trainer.h"
 
-#include <cstdlib>
 #include <gtest/gtest.h>
+#include <cstdlib>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 static const string& configFile = "trainer/tests/sample_trainer_config.conf";
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
+DECLARE_int32(gpu_id);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_string(config_args);
 
 struct comData {
   vector<Argument> outArgs;
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index a7c6862ce3362556fa60cc3309445347476e7f33..a7000eb77e1bbeab4f6e38c0322f82bde7164080 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,22 +25,22 @@ using namespace std;     // NOLINT
 static const string& configFile1 =
     "trainer/tests/sample_trainer_config_qb_rnn.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
-
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
-P_DEFINE_double(max_diff_ratio,
-                0.0f,
-                "max diff ratio allowed for parameters value");
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
+
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
+DEFINE_double(max_diff_ratio,
+              0.0f,
+              "max diff ratio allowed for parameters value");
 
 int gNumDevices = 0;
 
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 81320da6ac9c6e880b936a6b1e2650796bb50ff7..80c61e259e71dd31d7637072248b22a2910c532e 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,35 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              false,
-              "whether need to run in double accuracy");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            false,
+            "whether need to run in double accuracy");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_int32(seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_int32(seed);
 
 struct ComData {
   vector<Argument> outArgs;
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
index a52f2fa7e7708925dbcb173167b17bbfef93a4da..383505f8131264844069d6f0fa13f4e0ac1f97af 100644
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,30 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              true,
-              "whether need to run in double accuracy (recommended)");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            true,
+            "whether need to run in double accuracy (recommended)");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
index 6db33439b319e84e99e828246ca672fa8274e4bf..0c79404eee1c0902c5c8e8eefd139da3da584636 100644
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ b/paddle/trainer/tests/test_Prediction.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,11 +18,11 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
-P_DEFINE_string(merger,
-                "./paddle_merge_model",
-                "path to paddle_merge_model binary");
+DECLARE_string(config);
+DECLARE_string(config_args);
+DEFINE_string(merger,
+              "./paddle_merge_model",
+              "path to paddle_merge_model binary");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index e53291386c6b553e26248dae75e321d4b7246823..66ec65e340a435a7260028611828fb28845e0728 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifndef PADDLE_NO_PYTHON
+#include <DataConfig.pb.h>
 #include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/gserver/dataproviders/DataProvider.h>
-#include <DataConfig.pb.h>
 #include <paddle/math/Matrix.h>
 #include <paddle/parameter/Argument.h>
+#include <paddle/utils/PythonUtil.h>
+#include <fstream>
+#include <typeinfo>
 #include <unordered_map>
 #include <unordered_set>
-#include <typeinfo>
-#include <fstream>
 #include "picojson.h"
 
 void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 900c05af851aede67253535228d75d211dee6a85..371282dd6bb9a995bc6ae8b2a5bd708f831d7e33 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,10 +28,10 @@ static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(allow_only_one_model_on_one_gpu);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_bool(allow_only_one_model_on_one_gpu);
 
 void checkGradientTest(const string& configFile,
                        bool useGpu,
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index da2954d1664fc18cb78e6217807ff9799d220f7f..ee21008aec56da289dab88f72f57a1703e392fad 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/utils/GlobalConstants.h>
+#include <paddle/utils/PythonUtil.h>
 #include "paddle/trainer/Trainer.h"
 #include "paddle/trainer/TrainerInternal.h"
 
@@ -27,12 +27,12 @@ static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
 
 class TrainerForTest : public paddle::Trainer {
 public:
@@ -122,10 +122,10 @@ TEST(average_window_cpu, gpu4) {
 #endif
 
 // 3. test trainer + pserver.
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
 
 double checkRemoteParameterUpdater(TrainerForTest& trainer) {
   auto gradientMachine = trainer.getGradientMachine();
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
index 664e18cb986811ffca2a4865c5f50045ace122e1..d1bb9b877fe26702948586dbe90b9ff0ee27c1d6 100644
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -1,5 +1,5 @@
 #edit-mode: -*- python -*-
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2,
                       padding_y=2,
                       stride=2,
                       stride_y=3,
-                      img_width=3,
                       pool_type=CudnnAvgPooling())
 
 concat = concat_layer(input=[fc3, fc4])
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 49e8a97ad057246addf29274dd9c436d1481de91..03446b3b2f6d5ff42fbf0d735a24d88bd0429747 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <fstream>
 
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/trainer/Trainer.h>
+#include <paddle/utils/PythonUtil.h>
 
 #include <gtest/gtest.h>
 
@@ -30,7 +30,7 @@ static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1";  // NOLINT
 static string expectFile =                                           // NOLINT
     "trainer/tests/rnn_gen_test_model_dir/r1.test";                  // NOLINT
 
-P_DECLARE_string(config_args);
+DECLARE_string(config_args);
 
 vector<float> readRetFile(const string& fname) {
   ifstream inFile(fname);
diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp
index 82c5b84e5960753d5ec4c35bd667a8e43269e9e1..a6dbdcae3f32c894d35e8114488d4a3264c6c5f2 100644
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/utils/BarrierStat.h"
+#include <string.h>
 #include <sys/types.h>
-#include <iomanip>
 #include <algorithm>
-#include <string.h>
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/BarrierStat.h"
+#include <iomanip>
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(log_barrier_abstract,
-              true,
-              "if true, show abstract of barrier performance");
-P_DEFINE_int32(log_barrier_lowest_nodes,
-               5,
-               "how many lowest node will be logged");
-P_DEFINE_bool(log_barrier_show_log,
-              false,  // for performance tuning insight
-              "if true, always show barrier abstract even with little gap");
+DEFINE_bool(log_barrier_abstract,
+            true,
+            "if true, show abstract of barrier performance");
+DEFINE_int32(log_barrier_lowest_nodes,
+             5,
+             "how many lowest node will be logged");
+DEFINE_bool(log_barrier_show_log,
+            false,  // for performance tuning insight
+            "if true, always show barrier abstract even with little gap");
 
 namespace paddle {
 
diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h
index 661340ad275365ab567175d4280abdab18444fac..a9c925eff66838d58d540d7be5476e6207a30bec 100644
--- a/paddle/utils/BarrierStat.h
+++ b/paddle/utils/BarrierStat.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,18 +15,17 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
-#include <string>
 #include <sys/time.h>
-#include <memory>
 #include <iostream>
+#include <list>
+#include <memory>
 #include <mutex>
+#include <string>
 #include <unordered_map>
-#include <list>
 
-#include "Logging.h"
 #include "Locks.h"
+#include "Logging.h"
 #include "ThreadLocal.h"
-#include "Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h
index ee58ccb2ad42ac9e5380e3a80fe0044965eab083..1ac27bafabd1945d1d01e3bead22b0dd200d8688 100644
--- a/paddle/utils/ClassRegistrar.h
+++ b/paddle/utils/ClassRegistrar.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp
index 307e304bb03d79fa9a640ece9c84845919b0d9c4..63f16bc54c575a0d5ae02141be3c467ee784b095 100644
--- a/paddle/utils/CommandLineParser.cpp
+++ b/paddle/utils/CommandLineParser.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,223 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "CommandLineParser.h"
-#ifndef PADDLE_USE_GFLAGS
-#include "paddle/utils/StringUtil.h"
-#include <algorithm>
-#include <iostream>
-#include <iomanip>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <utility>
-#include <tuple>
 
-namespace paddle {
-
-static constexpr int kStatusOK = 0;
-static constexpr int kStatusInvalid = 1;
-static constexpr int kStatusNotFound = 2;
-
-/**
- * \brief: Convert a string to any type value.
- *
- * \note: It will specialize by type T that is supported.
- */
-template <typename T>
-bool StringToValue(const std::string& content, T* value) {
-  bool ok;
-  *value = str::toWithStatus<T>(content, &ok);
-  return ok;
-}
-
-template <>
-bool StringToValue<bool>(const std::string& content, bool* value) {
-  std::string tmp = content;
-
-  std::transform(tmp.begin(),
-                 tmp.end(),
-                 tmp.begin(),
-                 [](char in) -> char {
-                   if (in <= 'Z' && in >= 'A') {
-                     return in - ('Z' - 'z');
-                   } else {
-                     return in;
-                   }
-                 });  // tolower.
-
-  if (tmp == "true" || tmp == "1") {
-    *value = true;
-    return true;
-  } else if (tmp == "false" || tmp == "0") {
-    *value = false;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <>
-bool StringToValue<std::string>(const std::string& content,
-                                std::string* value) {
-  *value = content;
-  return true;
-}
-
-/**
- * \brief Parse argument "--blah=blah".
- *
- * \param argument: The command line argument string, such as "--blah=blah"
- * \param [out] extraInfo: The details error message for parse argument.
- * \return: kStatusOK, kStatusInvalid, kStatusNotFound
- */
-template <typename T>
-int ParseArgument(const std::string& argument, std::string* extraInfo) {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    T* value = command.value;
-
-    std::string prefix = "--";
-    prefix += name;
-    prefix += "=";
-    std::string content;
-    if (str::startsWith(argument, prefix)) {
-      content = argument.substr(prefix.size(), argument.size() - prefix.size());
-    } else {
-      prefix = "-";
-      prefix += name;
-      prefix += "=";
-      if (str::startsWith(argument, prefix)) {
-        content =
-            argument.substr(prefix.size(), argument.size() - prefix.size());
-      }
-    }
-
-    if (!content.empty()) {
-      if (StringToValue(content, value)) {
-        return kStatusOK;
-      } else {
-        *extraInfo = name;
-        return kStatusInvalid;
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * @brief ParseBoolArgumentExtra
- * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as
- * false
- */
-static int ParseBoolArgumentExtra(const std::string& argument,
-                                  std::string* extraInfo) {
-  (void)(extraInfo);  // unused extraInfo, just make api same.
-
-  //! @warning: The order and content of prefixes is DESIGNED for parsing
-  //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes
-  //! use of this fact. DO NOT CHANGE IT without reading how to parse command
-  //! below.
-  static const std::vector<std::pair<const char*, bool>> prefixes = {
-      {"-", true}, {"--", true}, {"-no", false}, {"--no", false}};
-
-  for (flags_internal::CommandLineFlagRegistry<bool>::Command& command :
-       flags_internal::CommandLineFlagRegistry<bool>::Instance()->commands) {
-    if (argument.size() > command.name.size()) {
-      //! Use the length of prefix is 1, 2, 3, 4.
-      size_t diff = argument.size() - command.name.size() - 1UL;
-      if (diff < prefixes.size()) {
-        const std::string& prefix = std::get<0>(prefixes[diff]);
-        if (argument == prefix + command.name) {
-          *command.value = std::get<1>(prefixes[diff]);
-          return kStatusOK;
-        }
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * \brief: Print command line arguments' usage with type T.
- */
-template <typename T>
-static void PrintTypeUsage() {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    name = "--" + name;  // Program will exit, so modify name is safe.
-    std::string& desc = command.text;
-    T& defaultValue = command.defaultValue;
-    std::cerr << std::setw(20) << name << ": " << desc
-              << "[default:" << defaultValue << "]." << std::endl;
-  }
-}
-
-template <typename... TS>
-static void PrintTypeUsages() {
-  int unused[] = {0, (PrintTypeUsage<TS>(), 0)...};
-  (void)(unused);
-}
-/**
- * \brief: Print all usage, and exit(1)
- */
-static void PrintUsageAndExit(const char* argv0) {
-  std::cerr << "Program " << argv0 << " Flags: " << std::endl;
-  PrintTypeUsages<bool, int32_t, std::string, double, int64_t, uint64_t>();
-  exit(1);
-}
-
-/**
- * \brief: Print the error flags, usage, and exit.
- */
-static void PrintParseError(const std::string& name,
-                            const char* actualInput,
-                            const char* arg0) {
-  std::cerr << "Parse command flag " << name << " error! User input is "
-            << actualInput << std::endl;
-  PrintUsageAndExit(arg0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
-  int unused_argc = 1;
-  std::string extra;
-  for (int i = 1; i < *argc; ++i) {
-    std::string arg = argv[i];
-    int s = kStatusInvalid;
-#define ParseArgumentWithType(type)           \
-  s = ParseArgument<type>(arg, &extra);       \
-  if (s == kStatusOK) {                       \
-    continue;                                 \
-  } else if (s == kStatusInvalid) {           \
-    PrintParseError(extra, argv[i], argv[0]); \
-  }
-
-    ParseArgumentWithType(bool);  // NOLINT
-    ParseArgumentWithType(int32_t);
-    ParseArgumentWithType(double);  // NOLINT
-    ParseArgumentWithType(int64_t);
-    ParseArgumentWithType(uint64_t);
-    ParseArgumentWithType(std::string);
-
-#undef ParseArgumentWithType
-    s = ParseBoolArgumentExtra(arg, &extra);
-    if (s == kStatusOK) {
-      continue;
-    }
-
-    if (withHelp && (arg == "--help" || arg == "-h")) {
-      PrintUsageAndExit(argv[0]);
-    }
-
-    // NOT Found for all flags.
-    std::swap(argv[unused_argc++], argv[i]);
-  }
-  *argc = unused_argc;
-}
-
-}  // namespace paddle
-#else
 namespace paddle {
 #ifndef GFLAGS_NS
 #define GFLAGS_NS google
@@ -246,4 +30,3 @@ void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
 }
 
 }  // namespace paddle
-#endif
diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h
index c46567913e253bdda645f129449773040c0ec93d..4e89f90bb910cee1adc7fb8dace81ff58435351f 100644
--- a/paddle/utils/CommandLineParser.h
+++ b/paddle/utils/CommandLineParser.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,167 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#ifndef PADDLE_USE_GFLAGS
-#include "DisableCopy.h"
-#include <string>
-#include <vector>
-#include <stdint.h>
 
-namespace paddle {
-
-namespace flags_internal {
-
-/**
- * Command line flag registry for special type T. It will store all command
- * arguments settings. such as name, default value.
- */
-template <typename T>
-struct CommandLineFlagRegistry {
-  /**
-   * The factory method of CommandLineFlagRegistry
-   *
-   * \return: The singleton instance of CommandLineFlagRegistry.
-   */
-  static CommandLineFlagRegistry* Instance() {
-    static CommandLineFlagRegistry instance_;
-    return &instance_;
-  }
-
-  struct Command {
-    /// name of argument.
-    std::string name;
-    /// address of actual variable. such as FLAGS_xxx.
-    T* value;
-    /// usage text.
-    std::string text;
-    /// default value of this command.
-    T defaultValue;
-  };
-
-  /// the command line arguments of type T.
-  std::vector<Command> commands;
-
-  DISABLE_COPY(CommandLineFlagRegistry);
-
-private:
-  inline CommandLineFlagRegistry() {}
-};
-
-/**
- *Helper class to register command line flag.
- */
-template <typename T>
-struct CommandLineFlagRegister {
-  /**
-   * \brief: Register a command line argument
-   *
-   * \param [in] name: The command line name.
-   * \param [inout] val: The command line argument instance, FLAGS_xxx.
-   * \param [in] desc: The command line helper message.
-   */
-  CommandLineFlagRegister(const std::string& name,
-                          T* val,
-                          const std::string desc) {
-    CommandLineFlagRegistry<T>::Instance()->commands.push_back(
-        {name, val, desc, *val});
-  }
-};
-
-/**
- * \brief: Define a command line arguments.
- *
- * \param type: The variable type, such as int, double, etc.
- * \param name: The variable name. The command line argument is '--name', the
- *variable
- *is 'FLAGS_name'
- * \param default_value: The default value of command line argument.
- * \param text: The description in command line argument.
- */
-#define PADDLE_DEFINE_variable(type, name, default_value, text) \
-  type FLAGS_##name = default_value;                            \
-  namespace paddle_flags_internal {                             \
-  paddle::flags_internal::CommandLineFlagRegister<type>         \
-      flags_internal_var_##name(#name, &FLAGS_##name, text);    \
-  }  // namespace paddle_flags_internal
-
-/**
- * Declare a variable to use.
- */
-#define PADDLE_DECLARE_variable(type, name) extern type FLAGS_##name;
-
-// DEFINE macro for each types.
-#define P_DEFINE_int32(name, default_value, text) \
-  PADDLE_DEFINE_variable(int32_t, name, default_value, text)
-
-#define P_DEFINE_bool(name, default_value, text) \
-  PADDLE_DEFINE_variable(bool, name, default_value, text)
-
-#define P_DEFINE_string(name, default_value, text) \
-  PADDLE_DEFINE_variable(std::string, name, default_value, text)
-
-#define P_DEFINE_double(name, default_value, text) \
-  PADDLE_DEFINE_variable(double, name, default_value, text)
-
-#define P_DEFINE_int64(name, default_value, text) \
-  PADDLE_DEFINE_variable(int64_t, name, default_value, text)
-
-#define P_DEFINE_uint64(name, default_value, text) \
-  PADDLE_DEFINE_variable(uint64_t, name, default_value, text)
-
-// Declare macro for each types.
-#define P_DECLARE_int32(name) PADDLE_DECLARE_variable(int32_t, name)
-#define P_DECLARE_bool(name) PADDLE_DECLARE_variable(bool, name)
-#define P_DECLARE_string(name) PADDLE_DECLARE_variable(std::string, name)
-#define P_DECLARE_double(name) PADDLE_DECLARE_variable(double, name)
-#define P_DECLARE_int64(name) PADDLE_DECLARE_variable(int64_t, name)
-#define P_DECLARE_uint64(name) PADDLE_DECLARE_variable(uint64_t, name)
-}  // namespace flags_internal
-
-/**
- * \brief Parse command line flags. If parse error, just failed and exit 1.
- *
- * \param [inout] argc: The command argument count. This method will modify
- *argc, and left unused arguments.
- * \param [inout] argv: The command argument values. This method will modify
- *argv, and left unused arguments.
- * \param [in] withHelp: True will parse '-h' and '--help' to print usage.
- *
- * \note: The Command line flags format basically as follow:
- *
- *  * If the type of flag is not bool, then the follow format of command line
- *    will be parsed:
- *    * --flag_name=value
- *    * -flag_name=value
- *
- *  * If the flag is bool, then:
- *    * --flag_name=value, -flag_name=value will be parsed.
- *       * if value.tolower() == "true"| "1" will be treated as true.
- *       * else if value.tolower() == "false" | "0" will be treated as false.
- *    * --flag_name will be parsed as true.
- *    * --noflag_name will be parsed as false.
- */
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
-
-}  // namespace paddle
-
-#else  // if use gflags.
 #include <gflags/gflags.h>
 
-#define P_DEFINE_int32 DEFINE_int32
-#define P_DEFINE_bool DEFINE_bool
-#define P_DEFINE_string DEFINE_string
-#define P_DEFINE_double DEFINE_double
-#define P_DEFINE_int64 DEFINE_int64
-#define P_DEFINE_uint64 DEFINE_uint64
-#define P_DECLARE_int32 DECLARE_int32
-#define P_DECLARE_bool DECLARE_bool
-#define P_DECLARE_string DECLARE_string
-#define P_DECLARE_double DECLARE_double
-#define P_DECLARE_int64 DECLARE_int64
-#define P_DECLARE_uint64 DECLARE_uint64
 namespace paddle {
 void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
 
 }  // namespace paddle
-
-#endif
diff --git a/paddle/utils/CompilerMacros.h b/paddle/utils/CompilerMacros.h
deleted file mode 100644
index 4236d750c4d8bf722fdf3e371dc95b2d9aa8223d..0000000000000000000000000000000000000000
--- a/paddle/utils/CompilerMacros.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#define ATTR_NORETURN __attribute__((noreturn))
diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8eefdd2980e7f56a836df6fd2ff8c31b81a55555
--- /dev/null
+++ b/paddle/utils/CpuId.cpp
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/CpuId.h"
+#include "paddle/utils/Util.h"
+
+#ifdef _WIN32
+
+#include <intrin.h>
+
+/// for MSVC
+#define CPUID(info, x) __cpuidex(info, x, 0)
+
+#else
+
+#include <cpuid.h>
+
+/// for GCC/Clang
+#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
+
+#endif
+
+namespace paddle {
+
+SIMDFlags::SIMDFlags() {
+  unsigned int cpuInfo[4];
+  // CPUID: https://en.wikipedia.org/wiki/CPUID
+  // clang-format off
+  CPUID(cpuInfo, 0x00000001);
+  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
+  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
+
+  CPUID(cpuInfo, 0x00000007);
+  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
+
+  CPUID(cpuInfo, 0x80000001);
+  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
+  // clang-fotmat on
+}
+
+SIMDFlags const* SIMDFlags::instance() {
+  static SIMDFlags instance;
+  return &instance;
+}
+
+}  // namespace paddle
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a354da75851ed7cca4e85e77714624634951f00
--- /dev/null
+++ b/paddle/utils/CpuId.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "DisableCopy.h"
+
+namespace paddle {
+
+// clang-format off
+enum simd_t {
+  SIMD_NONE   = 0,          ///< None
+  SIMD_SSE    = 1 << 0,     ///< SSE
+  SIMD_SSE2   = 1 << 1,     ///< SSE 2
+  SIMD_SSE3   = 1 << 2,     ///< SSE 3
+  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
+  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
+  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
+  SIMD_FMA3   = 1 << 6,     ///< FMA 3
+  SIMD_FMA4   = 1 << 7,     ///< FMA 4
+  SIMD_AVX    = 1 << 8,     ///< AVX
+  SIMD_AVX2   = 1 << 9,     ///< AVX 2
+  SIMD_AVX512 = 1 << 10,    ///< AVX 512
+};
+// clang-format on
+
+class SIMDFlags final {
+public:
+  DISABLE_COPY(SIMDFlags);
+
+  SIMDFlags();
+
+  static SIMDFlags const* instance();
+
+  inline bool check(int flags) const {
+    return !((simd_flags_ & flags) ^ flags);
+  }
+
+private:
+  int simd_flags_ = SIMD_NONE;
+};
+
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
+ *      avx2_fm4_stub();
+ * } else if (HAS_SIMD(SIMD_AVX)) {
+ *      avx_stub();
+ * }
+ *
+ * @endcode
+ */
+#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
+
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * 1. Check all SIMD flags at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_AVX && HAS_AVX2) {
+ *      avx2_stub();
+ * }
+ * @endcod
+ *
+ * 2. Check one SIMD flag at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_SSE41 || HAS_SSE42) {
+ *      sse4_stub();
+ * }
+ * @endcode
+ */
+// clang-format off
+#define HAS_SSE     HAS_SIMD(SIMD_SSE)
+#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
+#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
+#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
+#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
+#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
+#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
+#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
+#define HAS_AVX     HAS_SIMD(SIMD_AVX)
+#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
+#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
+// clang-format on
+
+}  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp
index 8740fe662ea21ce93c7c0d9505cdeb75975b3020..66b38218a7c7ec146f366ded516ebe22d012e47f 100644
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "CustomStackTrace.h"
-#include "CommandLineParser.h"
 #include <iostream>
+#include "CommandLineParser.h"
 
-P_DEFINE_bool(
+DEFINE_bool(
     layer_stack_error_only_current_thread,
     true,
     "Dump current thread or whole process layer stack when signal error "
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 878e14eb5fcf870bf6c29758a1b9a297c13ce730..6992e856223494d6575ef3261d82cbdf4e375885 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
 #include <stack>
 #include <thread>
 #include <unordered_map>
-#include <functional>
 
 #include "ThreadLocal.h"
 
@@ -96,7 +96,8 @@ public:
    */
   typedef std::function<void(const std::thread::id& /*threadId*/,
                              bool* /*isPushing*/,
-                             const T& /*item*/)> DumpCallback;
+                             const T& /*item*/)>
+      DumpCallback;
 
   /**
    * Dump all thread stack, and all stack will be cleared.
diff --git a/paddle/utils/DisableCopy.h b/paddle/utils/DisableCopy.h
index e991c07cdf68dac2bdf7fd66de03a292a3bec3c8..41de98bbde664651803c8db4c0cd7216b2ff4231 100644
--- a/paddle/utils/DisableCopy.h
+++ b/paddle/utils/DisableCopy.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp
index b2fad3ac9dd6477e388185d95ebd49c8f0da4c84..4ddce35ed31a8fed3f25cb3b03348b4eda8fcfdd 100644
--- a/paddle/utils/Excepts.cpp
+++ b/paddle/utils/Excepts.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index a84a2d33a6a3d0664218151befd6b2af44f72a97..dc3369b7e8c27cf53a03ce56b18a123f291d2d6d 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 6fae24e1b58c5296019cfaefe97905c3e8632210..59d6cbdc513660b87cb013d8aa92c5c8f9289ecb 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,65 +15,61 @@ limitations under the License. */
 #include "Flags.h"
 
 #ifdef PADDLE_ONLY_CPU
-P_DEFINE_bool(use_gpu, false, "Only support CPU training");
+DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
-P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
+DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
-P_DEFINE_bool(
-    parallel_nn,
-    false,
-    "Whether to use multi-threads to calculate one neural network."
-    "If it was set false, use gpu_id specify which gpu core to use"
-    "(the device property in the trainer config file will be ingored)."
-    "If it was set true, the gpu core is specified by the trainer"
-    "  config file(gpu_id will be ignored).");
-P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
-P_DEFINE_int32(gpu_id, 0, "Which gpu core to use");
-P_DEFINE_int32(port, 20134, "Listening port for pserver");
-P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
-P_DEFINE_int32(ports_num,
-               1,
-               "The ports number for parameter send,"
-               " increment based on default port number");
-P_DEFINE_int32(ports_num_for_sparse,
-               0,
-               "The ports number for parameter send,"
-               " increment based on default (port + ports_num)");
-P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
-P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
-P_DEFINE_int32(
-    trainer_id,
-    0,
-    "For distributed training, each trainer must be given an unique id"
-    " ranging from 0 to num_trainers-1. Trainer 0 is the master"
-    " trainer");
-P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
-P_DEFINE_string(comment, "", "A string for commenting this training task");
-P_DEFINE_string(load_missing_parameter_strategy,
-                "fail",
-                "which operation to take on load model fails. support "
-                "fail/rand/zero only.");
-P_DEFINE_int32(log_period, 100, "Log progress every so many batches");
-P_DEFINE_int32(log_period_server,
-               500,
-               "Log progress every so many batches at pserver end");
-P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
-P_DEFINE_int32(enable_parallel_vector,
-               0,
-               "threshold for enable parallel vector");
-P_DEFINE_bool(loadsave_parameters_in_pserver,
-              false,
-              "load and save parameters in pserver. "
-              "only work while parameter set sparse_remote_update.");
-P_DEFINE_int32(beam_size,
-               1,
-               "Beam size used in generating most probable output sequences.");
+DEFINE_bool(parallel_nn,
+            false,
+            "Whether to use multi-threads to calculate one neural network."
+            "If it was set false, use gpu_id specify which gpu core to use"
+            "(the device property in the trainer config file will be ingored)."
+            "If it was set true, the gpu core is specified by the trainer"
+            "  config file(gpu_id will be ignored).");
+DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
+DEFINE_int32(gpu_id, 0, "Which gpu core to use");
+DEFINE_int32(port, 20134, "Listening port for pserver");
+DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
+DEFINE_int32(ports_num,
+             1,
+             "The ports number for parameter send,"
+             " increment based on default port number");
+DEFINE_int32(ports_num_for_sparse,
+             0,
+             "The ports number for parameter send,"
+             " increment based on default (port + ports_num)");
+DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
+DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
+DEFINE_int32(trainer_id,
+             0,
+             "For distributed training, each trainer must be given an unique id"
+             " ranging from 0 to num_trainers-1. Trainer 0 is the master"
+             " trainer");
+DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
+DEFINE_string(comment, "", "A string for commenting this training task");
+DEFINE_string(load_missing_parameter_strategy,
+              "fail",
+              "which operation to take on load model fails. support "
+              "fail/rand/zero only.");
+DEFINE_int32(log_period, 100, "Log progress every so many batches");
+DEFINE_int32(log_period_server,
+             500,
+             "Log progress every so many batches at pserver end");
+DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
+DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector");
+DEFINE_bool(loadsave_parameters_in_pserver,
+            false,
+            "load and save parameters in pserver. "
+            "only work while parameter set sparse_remote_update.");
+DEFINE_int32(beam_size,
+             1,
+             "Beam size used in generating most probable output sequences.");
 
-P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
-P_DEFINE_string(predict_file, "", "File name for saving predict result");
-P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
-P_DEFINE_string(init_model_path,
-                "",
-                "Path of the initial model parameters."
-                "If it was set, start_pass will be ignored.");
+DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
+DEFINE_string(predict_file, "", "File name for saving predict result");
+DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
+DEFINE_string(init_model_path,
+              "",
+              "Path of the initial model parameters."
+              "If it was set, start_pass will be ignored.");
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index dda60c3f965abd8575677c785b21b058b3400ee5..2ebbcb24eb061531d0807756528d7bf16e6aa124 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,28 +16,28 @@ limitations under the License. */
 
 #include "CommandLineParser.h"
 
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_int32(async_count);
-P_DECLARE_int32(port);
-P_DECLARE_int32(data_server_port);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(trainer_count);
-P_DECLARE_int32(ports_num);
-P_DECLARE_int32(ports_num_for_sparse);
-P_DECLARE_string(nics);
-P_DECLARE_string(rdma_tcp);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_string(comment);
-P_DECLARE_string(load_missing_parameter_strategy);
-P_DECLARE_int32(log_period);
-P_DECLARE_int32(log_period_server);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_int32(enable_parallel_vector);
-P_DECLARE_bool(loadsave_parameters_in_pserver);
-P_DECLARE_int32(beam_size);
-P_DECLARE_bool(show_layer_stat);
-P_DECLARE_string(predict_file);
-P_DECLARE_bool(prev_batch_state);
-P_DECLARE_string(init_model_path);
+DECLARE_bool(parallel_nn);
+DECLARE_int32(async_count);
+DECLARE_int32(port);
+DECLARE_int32(data_server_port);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_int32(trainer_count);
+DECLARE_int32(ports_num);
+DECLARE_int32(ports_num_for_sparse);
+DECLARE_string(nics);
+DECLARE_string(rdma_tcp);
+DECLARE_int32(trainer_id);
+DECLARE_int32(num_gradient_servers);
+DECLARE_string(comment);
+DECLARE_string(load_missing_parameter_strategy);
+DECLARE_int32(log_period);
+DECLARE_int32(log_period_server);
+DECLARE_double(checkgrad_eps);
+DECLARE_int32(enable_parallel_vector);
+DECLARE_bool(loadsave_parameters_in_pserver);
+DECLARE_int32(beam_size);
+DECLARE_bool(show_layer_stat);
+DECLARE_string(predict_file);
+DECLARE_bool(prev_batch_state);
+DECLARE_string(init_model_path);
diff --git a/paddle/utils/GlobalConstants.cpp b/paddle/utils/GlobalConstants.cpp
index d769cd1ee7d4403f9fddbe91d2afec2c986d6b18..9e8dade0b228eb642a965eaa5bfe0653fe2749de 100644
--- a/paddle/utils/GlobalConstants.cpp
+++ b/paddle/utils/GlobalConstants.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 4c74c17a50c8cdbc18a075a58f97efc6b3330deb..707346f2c76e59b50722f4f8805ebe56c3cf861b 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index 5990e1657021611437e8fe730147dfaf207c800d..0f922f3548d97eb16ca897564faf1bf083f0d5ac 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp
index 14303bd4c747db2c10ee24b1601f709a79174850..5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2 100644
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,175 +18,9 @@ limitations under the License. */
  */
 
 #include "Logging.h"
-#ifndef PADDLE_USE_GLOG
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-#include <thread>
-#include <mutex>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
 
 namespace paddle {
 
-namespace internal {
-
-std::string join(const std::string& part1, const std::string& part2) {
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline bool env2bool(const char* envName, bool defaultValue = false) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    return memchr("tTyY1\0", envValue[0], 6) != nullptr;
-  }
-}
-
-static inline int env2int(const char* envName, int defaultValue = 0) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    int retValue = defaultValue;
-    try {
-      retValue = std::stoi(envValue);
-    } catch (...) {
-      // pass
-    }
-    return retValue;
-  }
-}
-
-static inline int env2index(const char* envName,
-                            const std::vector<std::string>& options,
-                            int defaultValue) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    for (size_t i = 0; i < options.size(); ++i) {
-      if (options[i] == envValue) {
-        return static_cast<int>(i);
-      }
-    }
-    return defaultValue;
-  }
-}
-
-static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true);
-static const std::vector<std::string> gLevelName = {
-    "INFO", "WARNING", "ERROR", "FATAL"};
-static int gMinLogLevel =
-    env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0));
-
-static std::vector<std::vector<int>> gLogFds;
-static std::vector<int> gLogFileFds;
-static bool gLogInited = false;
-static void freeLogFileFds() {
-  for (auto fd : gLogFileFds) {
-    close(fd);
-  }
-}
-
-static void initializeLogFds(char* argv0) {
-  gLogFds.resize(NUM_SEVERITIES);
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && gLogToStderr;
-       ++i) {  // Add stderr
-    std::vector<int>& fds = gLogFds[i];
-    fds.push_back(STDERR_FILENO);
-  }
-
-  char* logDir = getenv("PLOG_LOGDIR");
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && logDir != nullptr; ++i) {
-    std::string filename =
-        join(logDir, std::string(argv0) + "." + gLevelName[i]);
-    int fd = open(filename.c_str(), O_CREAT | O_WRONLY, 0644);
-    if (fd == -1) {
-      fprintf(stderr, "Open log file error!");
-      exit(1);
-    }
-    gLogFileFds.push_back(fd);
-
-    std::vector<int>& curFds = gLogFds[i];
-    curFds.insert(curFds.end(), gLogFileFds.begin(), gLogFileFds.end());
-  }
-
-  atexit(freeLogFileFds);
-  gLogInited = true;
-}
-
-static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort;
-
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-LogMessage::~LogMessage() { this->generateLogMessage(); }
-
-void LogMessage::generateLogMessage() {
-  if (!gLogInited) {
-    fprintf(stderr,
-            "%c %s:%d] %s\n",
-            "IWEF"[severity_],
-            fname_,
-            line_,
-            str().c_str());
-  } else {
-    for (auto& fd : gLogFds[this->severity_]) {
-      dprintf(fd,
-              "%c %s:%d] %s\n",
-              "IWEF"[severity_],
-              fname_,
-              line_,
-              str().c_str());
-    }
-  }
-}
-
-LogMessageFatal::LogMessageFatal(const char* file, int line)
-    : LogMessage(file, line, FATAL) {}
-
-LogMessageFatal::~LogMessageFatal() {
-  generateLogMessage();
-  gFailureFunctionPtr();
-}
-}  // namespace internal
-
-void initializeLogging(int argc, char** argv) {
-  internal::initializeLogFds(argv[0]);
-}
-
-namespace logging {
-void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; }
-
-void installFailureFunction(void (*callback)() ATTR_NORETURN) {
-  paddle::internal::gFailureFunctionPtr = callback;
-}
-
-}  // namespace logging
-
-}  // namespace paddle
-
-#else
-namespace paddle {
 void initializeLogging(int argc, char** argv) {
   (void)(argc);
   if (!getenv("GLOG_logtostderr")) {
@@ -197,13 +31,16 @@ void initializeLogging(int argc, char** argv) {
 }
 
 namespace logging {
+
 void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
+
 void installFailureFunction(void (*callback)()) {
   google::InstallFailureFunction(callback);
 }
+
 void installFailureWriter(void (*callback)(const char*, int)) {
   google::InstallFailureWriter(callback);
 }
+
 }  // namespace logging
 }  // namespace paddle
-#endif
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index e9029b421fa3b68845a54194f4cfa69439a99a0c..d9e551f0891fa0808b8699aea94a0d2ab4f81cb3 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,179 +18,25 @@ limitations under the License. */
  */
 
 #pragma once
-#include <sstream>
 #include <memory>
+#include <sstream>
 #include <string>
 
-#ifndef PADDLE_USE_GLOG
-#include "CompilerMacros.h"
-
-//! TODO(yuyang18): Move this utility macro into some global header.
-#define PP_CAT(a, b) PP_CAT_I(a, b)
-#define PP_CAT_I(a, b) PP_CAT_II(~, a##b)
-#define PP_CAT_II(p, res) res
-
-/**
- * Generate Unique Variable Name, Usefully in macro.
- * @SEE
- * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros
- */
-#define UNIQUE_NAME(base) PP_CAT(base, __LINE__)
-
+#include <glog/logging.h>
 namespace paddle {
 
-//! Log levels.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-const int NUM_SEVERITIES = 4;
-
-namespace internal {
-
-class LogMessage : public std::basic_ostringstream<char> {
-public:
-  LogMessage(const char* fname, int line, int severity);
-  ~LogMessage();
-
-protected:
-  /**
-   * @brief Print log message to stderr, files, etc.
-   */
-  void generateLogMessage();
-
-private:
-  const char* fname_;
-  int line_;
-  int severity_;
-};
-
-// LogMessageFatal ensures the process will exit in failure after
-// logging this message.
-class LogMessageFatal : public LogMessage {
-public:
-  LogMessageFatal(const char* file, int line) __attribute__((cold));
-  ~LogMessageFatal() __attribute__((noreturn));
-};
-
-#define _P_LOG_INFO \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::INFO)
-#define _P_LOG_WARNING \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::WARNING)
-#define _P_LOG_ERROR \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::ERROR)
-#define _P_LOG_FATAL ::paddle::internal::LogMessageFatal(__FILE__, __LINE__)
-
-#define P_LOG(severity) _P_LOG_##severity
-
-#define P_LOG_FIRST_N(severity, n)                                       \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                           \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) ++UNIQUE_NAME(LOG_OCCURRENCES); \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) P_LOG(severity)
-
-#define P_LOG_IF_EVERY_N(severity, condition, n)                              \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                                \
-  if (condition && ((UNIQUE_NAME(LOG_OCCURRENCES) =                           \
-                         (UNIQUE_NAME(LOG_OCCURRENCES) + 1) % n) == (1 % n))) \
-  P_LOG(severity)
-
-#define P_LOG_EVERY_N(severity, n) P_LOG_IF_EVERY_N(severity, true, n)
-
-// TODO(jeff): Define a proper implementation of VLOG_IS_ON
-#define P_VLOG_IS_ON(lvl) ((lvl) <= 0)
-
-#define P_LOG_IF(severity, condition) \
-  if (condition) P_LOG(severity)
-
-#define P_VLOG(lvl) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl))
-
-#define P_VLOG_IF(lvl, cond) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl) && cond)
-
-#define P_VLOG_EVERY_N(lvl, n) P_LOG_IF_EVERY_N(INFO, P_VLOG_IS_ON(lvl), n)
-
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
-#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
-
-// CHECK dies with a fatal error if condition is not true.  It is *not*
-// controlled by NDEBUG, so the check will be executed regardless of
-// compilation mode.  Therefore, it is safe to do things like:
-//    CHECK(fp->Write(x) == 4)
-#define P_CHECK(condition)         \
-  if (PREDICT_FALSE(!(condition))) \
-  P_LOG(FATAL) << "Check failed: " #condition " "
-
-#define P_CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define P_CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define P_CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define P_CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define P_CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define P_CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define P_CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-
-//! GLOG compatible APIs
-//! NOTE: only implement Paddle actually used APIs.
-#define LOG(x) P_LOG(x)
-#define VLOG(x) P_VLOG(x)
-#define DLOG(x) P_VLOG(5)
-#define CHECK(x) P_CHECK(x)
-#define PCHECK(x) P_CHECK(x)
-#define CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-#define VLOG_IS_ON(x) P_VLOG_IS_ON(x)
-#define LOG_FIRST_N(severity, n) P_LOG_FIRST_N(severity, n)
-#define LOG_IF(severity, condition) P_LOG_IF(severity, condition)
-#define VLOG_EVERY_N(lvl, n) P_VLOG_EVERY_N(lvl, n)
-#define VLOG_IF(lvl, cond) P_VLOG_IF(lvl, cond)
-#define LOG_EVERY_N(severity, n) P_LOG_EVERY_N(severity, n)
-}  //  namespace internal
-
-/**
- * @brief initialize logging
- * @note: Current implement of logging is lack of:
- *          PrintCallStack when fatal.
- *          VLOG_IS_ON
- *        But it is portable to multi-platform, and simple enough to modify.
- */
 void initializeLogging(int argc, char** argv);
-namespace logging {
-/**
- * @brief Set Min Log Level. if Log.level < minLogLevel, then will not print log
- *        to stream
- * @param level. Any integer is OK, but only 0 <= x <= NUM_SEVERITIES is useful.
- */
-void setMinLogLevel(int level);
 
-/**
- * @brief Install Log(Fatal) failure function. Default is abort();
- * @param callback: The failure function.
- */
-void installFailureFunction(void (*callback)() ATTR_NORETURN);
-
-/**
- * @brief installFailureWriter
- * @note: not implemented currently.
- */
-inline void installFailureWriter(void (*callback)(const char*, int)) {
-  (void)(callback);  // unused callback.
-}
-}  //  namespace logging
-}  //  namespace paddle
-#else
-#include <glog/logging.h>
-namespace paddle {
-void initializeLogging(int argc, char** argv);
 namespace logging {
+
 void setMinLogLevel(int level);
+
 void installFailureFunction(void (*callback)());
+
 void installFailureWriter(void (*callback)(const char*, int));
-}  //  namespace logging
-}
-#endif  // PADDLE_USE_GLOG
+
+}  // namespace logging
+}  // namespace paddle
 
 #ifndef NDEBUG
 #define DEBUG_LEVEL 5
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp
index 7f17a825228ef56be7b8678bf003e57388d4b0bf..7faeff55c28b9065179ad27b3b604a9f411249e5 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PythonUtil.h"
-#include <sstream>
 #include <signal.h>
+#include <sstream>
 
 namespace paddle {
 
 #ifdef PADDLE_NO_PYTHON
 
-P_DEFINE_string(python_path, "", "python path");
-P_DEFINE_string(python_bin, "python2.7", "python bin");
+DEFINE_string(python_path, "", "python path");
+DEFINE_string(python_bin, "python2.7", "python bin");
 
 constexpr int kExecuteCMDBufLength = 204800;
 
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index 65677d90101a0ee2e62c8ac45c50b88326e169e1..daebaffc855518425ae43942c22ec150d2e327f0 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+// clang-format off
+#include "paddle/utils/Util.h"
 
 #ifndef PADDLE_NO_PYTHON
 // must include the following two blocks, otherwise,
@@ -33,13 +35,12 @@ limitations under the License. */
 #endif
 #include <Python.h>
 #include <frameobject.h>
-
 #endif
 
-#include "paddle/utils/Util.h"
 #include <stdarg.h>
-#include <mutex>
 #include <map>
+#include <mutex>
+// clang-format on
 
 namespace paddle {
 
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
index 58d17e86c432b90a6b3240dd5528146a24b72184..f054738f87c02d2d749eec8d6c7bb55b506a6d91 100644
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -142,9 +142,9 @@ public:
    */
   bool waitNotEmptyFor(int seconds) {
     std::unique_lock<std::mutex> lock(queueLock_);
-    return queueCV_.wait_for(lock,
-                             std::chrono::seconds(seconds),
-                             [this] { return numElements_ != 0; });
+    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
+      return numElements_ != 0;
+    });
   }
 
 private:
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index d7b20ca5eb2f4eadaa6b4acad056d669a9b59c14..44acee249554e41f715314a3cd7eef29e3e6c5b0 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Stat.h"
-#include "Util.h"
-#include <iomanip>
 #include <algorithm>
+#include <iomanip>
+#include "Util.h"
 
 namespace paddle {
 
@@ -65,6 +65,7 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
   auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
     uint64_t average = 0;
     if (info->count_ > 0) {
+      outPut << std::setfill(' ') << std::left;
       if (!isFirst) {
         outPut << std::setw(42) << " ";
       }
@@ -202,4 +203,21 @@ StatInfo::~StatInfo() {
   }
 }
 
+static unsigned g_profileCount = 0;
+static std::recursive_mutex g_profileMutex;
+
+GpuProfiler::GpuProfiler(std::string statName, std::string info)
+    : guard_(g_profileMutex) {
+  if (++g_profileCount == 1) {
+    LOG(INFO) << "Enable GPU Profiler Stat: [" << statName << "] " << info;
+    hl_profiler_start();
+  }
+}
+
+GpuProfiler::~GpuProfiler() {
+  if (--g_profileCount == 0) {
+    hl_profiler_end();
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 4051145d9246639fce5d041103c1211a939eddca..9be79e8859a3b4cff4c834ee698902706f947712 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,19 +15,19 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
-#include <string>
 #include <sys/time.h>
-#include <memory>
 #include <iostream>
+#include <list>
+#include <memory>
 #include <mutex>
+#include <string>
 #include <unordered_map>
-#include <list>
 
-#include "Logging.h"
 #include "BarrierStat.h"
 #include "Locks.h"
+#include "Logging.h"
 #include "ThreadLocal.h"
-#include "BarrierStat.h"
+#include "hl_gpu.h"
 
 namespace paddle {
 
@@ -283,4 +283,24 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 
 #endif  // DISABLE_TIMER
 
+class GpuProfiler final {
+public:
+  GpuProfiler(std::string statName, std::string info);
+  ~GpuProfiler();
+
+private:
+  std::lock_guard<std::recursive_mutex> guard_;
+};
+
+#ifdef PADDLE_DISABLE_PROFILER
+
+#define REGISTER_GPU_PROFILER(statName, ...)
+
+#else
+
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
+
+#endif  // DISABLE_PROFILER
+
 }  // namespace paddle
diff --git a/paddle/utils/StringUtil.cpp b/paddle/utils/StringUtil.cpp
index b416cda4af1572d76cdd9a7144e05b790eedf22a..0c98e6db34530ae40a7245768051b8ce8aa69202 100644
--- a/paddle/utils/StringUtil.cpp
+++ b/paddle/utils/StringUtil.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h
index 8b44dad19231781623a0a65d02b24ac1cf9e4523..0b4f4c9113ae9d714b634b67931e51b408bbe777 100644
--- a/paddle/utils/StringUtil.h
+++ b/paddle/utils/StringUtil.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <sstream>
 #include <string>
 #include <vector>
-#include <sstream>
 #include "Logging.h"
 
 namespace paddle {
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
index ade0ee496f94f6165f35dd1a0a37618df8fae585..ef36a8c5b2b0e95d759da8a781d781b71d067b7a 100644
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "Util.h"
-#include "Logging.h"
 #include <thread>
+#include "Logging.h"
+#include "Util.h"
 
 #include "Queue.h"
 #include "ThreadLocal.h"
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index 49d4b1526537def9b8183934faa971402f3678aa..75ccbd28cf21b7fafb43a072503dff14a29fec8a 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Util.h"
 #include "ThreadLocal.h"
 #include "CommandLineParser.h"
+#include "Util.h"
 
-P_DEFINE_bool(thread_local_rand_use_global_seed,
-              false,
-              "Whether to use global seed in thread local rand.");
+DEFINE_bool(thread_local_rand_use_global_seed,
+            false,
+            "Whether to use global seed in thread local rand.");
 
 namespace paddle {
 
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index 06c8b392af23f81ab48042cb4d24a40b1c50275d..a4987c9ec261a2ee57e62d1640e2a21c7f804c99 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,14 +15,14 @@ limitations under the License. */
 #pragma once
 
 #include <pthread.h>
-#include <sys/types.h>
 #include <sys/syscall.h>
+#include <sys/types.h>
 #include <unistd.h>
 #include <map>
 #include <mutex>
 #include <random>
-#include "Util.h"
 #include "Logging.h"
+#include "Util.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/TypeDefs.h
index e8be779bea255eec71057495d1253ed92c2256c3..c50a05e82daefd1273c896f3603957f4484ecd5d 100644
--- a/paddle/utils/TypeDefs.h
+++ b/paddle/utils/TypeDefs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index bc727cfa74cdfb51b36259bd08733804578f6d66..7c0d66c488f5064641c53ea7995a75c330a3e49d 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,11 +15,11 @@ limitations under the License. */
 #include "Util.h"
 
 #include <dirent.h>
+#include <pmmintrin.h>
 #include <signal.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <xmmintrin.h>
-#include <pmmintrin.h>
 
 #include <fstream>
 #include <mutex>
@@ -28,12 +28,12 @@ limitations under the License. */
 
 #include "CommandLineParser.h"
 #include "CustomStackTrace.h"
+#include "StringUtil.h"
 #include "Thread.h"
 #include "ThreadLocal.h"
 #include "Version.h"
-#include "StringUtil.h"
 
-P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
+DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #ifdef WITH_GOOGLE_PERFTOOLS
 /*
@@ -52,10 +52,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #include <gperftools/profiler.h>
 
-P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-P_DEFINE_string(profile_data_file,
-                "gperf.prof",
-                "file for storing profile data");
+DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
+DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
 
 static void profilerSwitch(int signalNumber) {
   bool static started = false;
@@ -126,25 +124,23 @@ void registerInitFunction(std::function<void()> func, int priority) {
 }
 
 void runInitFunctions() {
-  std::call_once(
-      g_onceFlag,
-      []() {
-        LOG(INFO) << "Calling runInitFunctions";
-        if (g_initFuncs) {
-          std::sort(g_initFuncs->begin(),
-                    g_initFuncs->end(),
-                    [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
-                      return x.first > y.first;
-                    });
-          for (auto& f : *g_initFuncs) {
-            f.second();
-          }
-          delete g_initFuncs;
-          g_initFuncs = nullptr;
-        }
-        g_initialized = true;
-        LOG(INFO) << "Call runInitFunctions done.";
-      });
+  std::call_once(g_onceFlag, []() {
+    LOG(INFO) << "Calling runInitFunctions";
+    if (g_initFuncs) {
+      std::sort(g_initFuncs->begin(),
+                g_initFuncs->end(),
+                [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
+                  return x.first > y.first;
+                });
+      for (auto& f : *g_initFuncs) {
+        f.second();
+      }
+      delete g_initFuncs;
+      g_initFuncs = nullptr;
+    }
+    g_initialized = true;
+    LOG(INFO) << "Call runInitFunctions done.";
+  });
 }
 
 void initMain(int argc, char** argv) {
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index ed38f8fa60b3716c12e755b047557c1409fa767c..24ddde28e7e9f44c32d70e1b9621954ee77b2883 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,25 +14,25 @@ limitations under the License. */
 
 #pragma once
 
+#include <sys/syscall.h>  // for syscall()
+#include <sys/types.h>
 #include <algorithm>
 #include <cmath>
-#include <string>
-#include <vector>
+#include <functional>
 #include <memory>
+#include <mutex>
+#include <string>
 #include <thread>
 #include <unordered_map>
-#include <mutex>
-#include <functional>
-#include <sys/syscall.h>  // for syscall()
-#include <sys/types.h>
+#include <vector>
 
 #include "CommandLineParser.h"
+#include "DisableCopy.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
-#include "DisableCopy.h"
 
-#include "TypeDefs.h"
 #include "Flags.h"
+#include "TypeDefs.h"
 #include "hl_gpu.h"
 
 /**
diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp
index e706983918b4a865f6674a34083ef0143bd6e185..731c30842118bce59ce45297d9c8f47fa0a69d69 100644
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,17 +14,12 @@ limitations under the License. */
 
 #include "Version.h"
 
-#include "Flags.h"
-#include "Util.h"
 #include <iomanip>
 #include <numeric>
-//! TODO(yuyang18) in gflags, version has another define. Use another flag
-//! instead.
-#ifndef PADDLE_USE_GFLAGS
-P_DEFINE_bool(version, false, "print version");
-#else
-P_DECLARE_bool(version);
-#endif
+#include "Flags.h"
+#include "Util.h"
+
+DECLARE_bool(version);
 
 namespace paddle {
 namespace version {
@@ -33,7 +28,12 @@ void printVersion(std::ostream& os) {
 #ifndef PADDLE_VERSION
 #define PADDLE_VERSION "unknown"
 #endif
-  os << "paddle version: " << PADDLE_VERSION << std::endl
+// converts macro to string
+// https://gcc.gnu.org/onlinedocs/cpp/Stringification.html
+#define xstr(s) str(s)
+#define str(s) #s
+
+  os << "paddle version: " << xstr(PADDLE_VERSION) << std::endl
      << std::boolalpha << "\t"
      << "withGpu: " << version::isWithGpu() << std::endl
      << "\t"
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index e6c799644ee7f88e4e90eec565d1bab2bc9faed7..d1a07d9485076e5382d47f7408fcbf032166b1ed 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <stddef.h>
-#include "TypeDefs.h"
 #include <iostream>
+#include "TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index 93016daeaea644ca44499fdc6024ec8deac57ca8..2a6f96e04d024ac3977bc154dbeeb69ce9ab3a5d 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
index ae563a6afd29b6315d9c6609474faddbfaaded14..e03992363fd6051a1970664d63406b2e7a47fce3 100644
--- a/paddle/utils/arch/osx/Locks.cpp
+++ b/paddle/utils/arch/osx/Locks.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/Logging.h"
 #include <dispatch/dispatch.h>
-#include <atomic>
 #include <libkern/OSAtomic.h>
+#include <atomic>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index adf489fafe722117c53789593923c21a087bd74a..26fafbd1ab3f2967b765b8bcb973fb745c0e6422 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -1,10 +1,9 @@
-add_simple_unittest(test_CommandLineParser)
-add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
 add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
+add_simple_unittest(test_SIMDFlags)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp
deleted file mode 100644
index 5ecfb2b4f511e63eac21a5eae3829532f6860d66..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_USE_GFLAGS
-//! Test Command Line Parser for paddle internal implement.
-
-#include <paddle/utils/CommandLineParser.h>
-#include <gtest/gtest.h>
-
-P_DEFINE_int32(i1, 1, "test int flag 1");
-P_DEFINE_int32(i2, 2, "test int flag 2");
-
-P_DEFINE_string(str1, "1", "test str flag 1");
-P_DEFINE_string(str2, "2", "test str flag 2");
-
-P_DEFINE_bool(b1, true, "test bool flag 1");
-P_DEFINE_bool(b2, false, "test bool flag 2");
-
-P_DEFINE_double(d1, 0.1, "test double flag 1");
-P_DEFINE_double(d2, -42.3, "test double flag 2");
-
-P_DEFINE_int64(l1, 1, "test int64 flag 1");
-P_DEFINE_int64(l2, 2, "test int64 flag 2");
-
-P_DEFINE_uint64(ul1, 32, "test uint64 flag 1");
-P_DEFINE_uint64(ul2, 33, "test uint64 flag 2");
-
-constexpr double EPSILON = 1e-5;
-
-#define cc(x) const_cast<char*>((x))
-
-TEST(CommandLineParser, defaultValue) {
-  char* argv[] = {cc("test_program"), cc("--unused_flag=134")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  paddle::ParseCommandLineFlags(&argc, argv);
-
-  // Check Default Value
-  ASSERT_EQ(argc, 2);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_str1, "1");
-  ASSERT_EQ(FLAGS_str2, "2");
-  ASSERT_EQ(FLAGS_b1, true);
-  ASSERT_EQ(FLAGS_b2, false);
-  ASSERT_NEAR(FLAGS_d1, 0.1, EPSILON);
-  ASSERT_NEAR(FLAGS_d2, -42.3, EPSILON);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_ul1, 32UL);
-  ASSERT_EQ(FLAGS_ul2, 33UL);
-}
-
-TEST(CommandLineParser, normal) {
-  char* argv[] = {cc("test_program"),
-                  cc("--i2=32"),
-                  cc("--str1=abc"),
-                  cc("--b2=1"),
-                  cc("-b1=False"),
-                  cc("--d2=.34"),
-                  cc("--d1=0"),
-                  cc("--l1=-12345678901234"),
-                  cc("-ul2=3212")};
-  int argc = sizeof(argv) / sizeof(char*);
-  paddle::ParseCommandLineFlags(&argc, argv);
-  ASSERT_EQ(argc, 1);
-  ASSERT_EQ(FLAGS_i2, 32);
-  ASSERT_EQ(FLAGS_str1, "abc");
-  ASSERT_EQ(FLAGS_b2, true);
-  ASSERT_EQ(FLAGS_b1, false);
-  ASSERT_NEAR(FLAGS_d2, 0.34, EPSILON);
-  ASSERT_NEAR(FLAGS_d1, 0.0, EPSILON);
-  ASSERT_EQ(FLAGS_l1, -12345678901234);
-  ASSERT_EQ(FLAGS_ul2, 3212UL);
-}
-
-TEST(CommandLineParser, printHelp) {
-  char* argv[] = {cc("test_program"), cc("--help")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  // Will Print Usage
-  ASSERT_DEATH(paddle::ParseCommandLineFlags(&argc, argv), ".*test_program.*");
-}
-
-TEST(CommandLineParser, parseError) {
-  char* argv[] = {cc("test_program"), cc("--i1=abc")};
-
-  int argc = sizeof(argv) / sizeof(char*);
-  ASSERT_DEATH(
-      paddle::ParseCommandLineFlags(&argc, argv),
-      "Parse command flag i1 error! User input is --i1=abc.*test_program.*");
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int argc, char** argv) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 3bfb381ed93659feebcc567a04b2a095dc94dfa8..2ce199837601755ac018889c07c223ad34c4a45b 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <chrono>
 
-#include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
+#include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+DEFINE_int32(test_thread_num, 10, "testing thread number");
 
 void testNormalImpl(
     const std::function<void(paddle::CustomStackTrace<std::string>&,
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
index d39a190961a96906eef2b510cb3538c639d5df5c..611b16aa7116d03ee51ba0095d043b78df1742ba 100644
--- a/paddle/utils/tests/test_CustomStackTracePrint.cpp
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Util.h"
 
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp
deleted file mode 100644
index 9f477fab14a2abde93505a05fc4c9ccd3d6426b6..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_Logging.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include <gtest/gtest.h>
-#include <fstream>
-#include <stdlib.h>
-#include <dirent.h>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-#ifndef PADDLE_USE_GLOG
-TEST(Logging, BasicalLog) {
-  auto pinfo = [] {
-    P_LOG(INFO) << "INFO";
-    exit(1);
-  };
-  ASSERT_DEATH(pinfo(), "I .*test_Logging.cpp:[0-9]+] INFO");
-
-  auto pwarn = [] {
-    P_LOG(WARNING) << "WARN";
-    exit(1);
-  };
-  ASSERT_DEATH(pwarn(), "W .*test_Logging.cpp:[0-9]+] WARN");
-
-  auto perr = [] {
-    P_LOG(ERROR) << "ERROR";
-    exit(1);
-  };
-  ASSERT_DEATH(perr(), "E .*test_Logging.cpp:[0-9]+] ERROR");
-
-  auto pfatal = [] { P_LOG(FATAL) << "FATAL"; };
-  ASSERT_DEATH(pfatal(), "F .*test_Logging.cpp:[0-9]+] FATAL");
-}
-
-TEST(Logging, Check) {
-  int a = 1;
-  int b = 2;
-  P_CHECK(a != b);
-
-  auto pcheckDown = [&] { P_CHECK(a == b); };
-  ASSERT_DEATH(pcheckDown(),
-               "F .*test_Logging.cpp:[0-9]+] Check failed: a == b ");
-
-  P_CHECK_LE(a, b);
-  P_CHECK_LT(a, b);
-  double t = 1.2;
-  P_CHECK_LE(a, t);
-  double* ptr = nullptr;
-
-  auto pcheckDown2 = [&] { P_CHECK_NOTNULL(ptr); };
-  ASSERT_DEATH(pcheckDown2(), "F");
-}
-
-#define cc(x) const_cast<char*>(x)
-
-TEST(Logging, LogToStderr) {
-  auto logToStderrCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-    P_LOG(INFO) << "This output will not print to std error";
-    exit(1);
-  };
-
-  ASSERT_DEATH(logToStderrCallback(), "");
-}
-
-constexpr char kLogDirName[] = "./test_log_dir";
-const std::vector<std::string> kLevels = {"INFO", "WARNING", "ERROR", "FATAL"};
-
-TEST(Logging, LogToDir) {
-  ASSERT_EQ(0, mkdir(kLogDirName, 0777));
-  auto logToDirCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    setenv("PLOG_LOGDIR", kLogDirName, true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-
-    P_LOG(INFO) << "INFO";
-    P_LOG(WARNING) << "WARNING";
-    P_LOG(ERROR) << "ERROR";
-    P_LOG(FATAL) << "FATAL";
-  };
-  ASSERT_DEATH(logToDirCallback(), "");
-
-  // There 4 file in logdir
-  auto dir = opendir(kLogDirName);
-  size_t fileCount = 0;
-  std::vector<std::string> filenames;
-  for (auto dirContent = readdir(dir); dirContent != nullptr;
-       dirContent = readdir(dir)) {
-    std::string filename(dirContent->d_name);
-    if (filename == "." || filename == "..") {
-      continue;
-    } else {
-      ++fileCount;
-      for (size_t i = 0; i < kLevels.size(); ++i) {
-        const std::string& curLevel = kLevels[i];
-        if (filename.size() > curLevel.length()) {
-          size_t diff = filename.size() - curLevel.length();
-          size_t j = 0;
-          for (; j < curLevel.length(); ++j) {
-            if (filename[j + diff] != curLevel[j]) {
-              // File Suffix Not Same, then break.
-              break;
-            }
-          }
-          if (j == curLevel.length()) {  // Same suffix.
-            std::ifstream fin;
-            auto fn = paddle::path::join(kLogDirName, filename);
-            fin.open(fn);
-            filenames.push_back(fn);
-            ASSERT_TRUE(fin.is_open());
-            size_t lineCounter = 0;
-            for (std::string line; std::getline(fin, line); ++lineCounter) {
-              // Do Nothing, Just calc lineCounter.
-            }
-
-            // For example.
-            // The info channel will have all log which level >= INFO
-            // So the info file's lineCounter should == 4.
-            ASSERT_EQ(kLevels.size() - i, lineCounter);
-            fin.close();
-          }
-        }
-      }
-    }
-  }
-  closedir(dir);
-  ASSERT_EQ(4UL, fileCount);  // 4 levels.
-  // Clean Unittest.
-  for (std::string& fn : filenames) {
-    ASSERT_EQ(remove(fn.c_str()), 0);
-  }
-  ASSERT_EQ(rmdir(kLogDirName), 0);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int, char**) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42edede209ad957c13c1cec8e6bb20bd0fe9d28b
--- /dev/null
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include "paddle/utils/CpuId.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+
+TEST(SIMDFlags, gccTest) {
+#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
+  // clang-format off
+  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
+  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
+  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
+  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
+  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
+  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
+  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
+  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
+// clang-format on
+#endif
+}
+
+TEST(SIMDFlags, normalPrint) {
+  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
+  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
+  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
+  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
+  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
+  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
+  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
+  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
+  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
+  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 77d281962cfeaa3cc951a72eddf4f37b619c5691..8351e7e3acd1afe1c6507ffced32f27ce065e5ce 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <vector>
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
index 2c699b791ffad8ed680c5537005aac7dad832f41..fdc914d1bcc3c74e0f05ef475069abc315bdc306 100644
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index 154db5d9c616d4817b933c82587834f5ce2d0f8e..2f5c5bbce07f39b799b928fd231bb4db1d2b3e05 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <atomic>
-#include <paddle/utils/Thread.h>
 #include <gtest/gtest.h>
+#include <paddle/utils/Thread.h>
+#include <atomic>
 
 using paddle::AsyncThreadPool;  // NOLINT
 
@@ -52,17 +52,13 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) {
   int counter = 0;
   const int numMonitors = 300;
   const int numSlaves = 300;
-  std::vector<AsyncThreadPool::JobFunc> moniterJobs(
-      numMonitors,
-      [&] {
-        std::vector<AsyncThreadPool::JobFunc> slaveJobs(
-            numSlaves,
-            [mut, &counter] {
-              std::lock_guard<std::mutex> lk(*mut);
-              counter++;
-            });
-        levelTwoPool.addBatchJobs(slaveJobs);
-      });
+  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
+    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
+      std::lock_guard<std::mutex> lk(*mut);
+      counter++;
+    });
+    levelTwoPool.addBatchJobs(slaveJobs);
+  });
   levelOnePool.addBatchJobs(moniterJobs);
   ASSERT_EQ(counter, numMonitors * numSlaves);
 }
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 20b9babd94cf4e6a475daece349c871bd606d83d..60c2214ffd1066ed4f7b95cd63dfe6a24fe66d67 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <set>
 #include <vector>
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 461c73f14c2dc9377cc39ebb8f1273eee81730a3..2c40070eca44d8656d7ce82157a1b840092b9965 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -6,21 +6,6 @@ set(proto_filenames
     ParameterService.proto
     TrainerConfig.proto)
 
-set(real_proto_files)
-
-# TODO(yuyang18): Some internal proto will also be depended on.
-#                 Find a way to automatically calculate all depends.
-foreach(filename ${proto_filenames})
-    add_custom_command(OUTPUT ${filename}
-        COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} -I '${INTERNAL_PROTO_PATH}'
-              ${PROJ_ROOT}/proto/${filename}.m4 > ${filename}
-        DEPENDS ${PROJ_ROOT}/proto/${filename}.m4
-        COMMENT "Generate ${filename}")
-endforeach()
-
-add_custom_target(proto_accuracy ALL
-                    DEPENDS ${proto_filenames})
-
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
 
@@ -35,9 +20,8 @@ foreach(filename ${proto_filenames})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
         COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-                  --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename}
-        DEPENDS proto_accuracy
-                ${PROJ_ROOT}/proto/${filename}.m4)
+		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -46,9 +30,8 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
         COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-                  --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename}
-        DEPENDS proto_accuracy
-                ${PROJ_ROOT}/proto/${filename}.m4)
+	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
@@ -57,5 +40,4 @@ add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
 add_library(paddle_proto STATIC
     ${PROTO_GEN})
-add_dependencies(paddle_proto proto_accuracy)
 target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/proto/DataConfig.proto.m4 b/proto/DataConfig.proto
similarity index 92%
rename from proto/DataConfig.proto.m4
rename to proto/DataConfig.proto
index 9862e4e7ef2ff96eafc91246e0b435c70fbe31d9..e895c184d9f95dba1449e6467a2566712837600b 100644
--- a/proto/DataConfig.proto.m4
+++ b/proto/DataConfig.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 package paddle;
 
-sinclude(`DataConfigExt.proto.m4')
+
 message FileGroupConf {
   optional uint32 queue_capacity = 1 [default = 1];
   // how many files to load for a load file thread
@@ -25,7 +26,7 @@ message FileGroupConf {
 };
 
 message DataConfig {
-sinclude(`DataConfigInter.proto.m4')
+
   required string type = 1;
 
   // name of a text file which contains a list of file names at each line
@@ -50,11 +51,11 @@ sinclude(`DataConfigInter.proto.m4')
 
   /// Note the field number 17, 18 and 19 have been deprecated.
 
-  // a list of values which will be used to create additional one dimensional real
+  // a list of values which will be used to create additional one dimensional float
   // values slots. These one dimensional slots can be used as the weight input
   // for cost layers.
   // Currently this is only supported by ProtoDataProvider.
-  repeated real constant_slots = 20;
+  repeated double constant_slots = 20;
 
   // for PyDataProvider.
   // Specify the load data script module name, object name and user args
@@ -79,6 +80,6 @@ sinclude(`DataConfigInter.proto.m4')
   optional bool is_main_data = 26 [default = true];
 
   // the usage ratio of instances. Setting to 1.0 means the use of all instances.
-  optional real usage_ratio = 27 [default = 1.0];
+  optional double usage_ratio = 27 [default = 1.0];
 };
 
diff --git a/proto/DataFormat.proto.m4 b/proto/DataFormat.proto
similarity index 96%
rename from proto/DataFormat.proto.m4
rename to proto/DataFormat.proto
index 556eace5e194ef26991cc06d1f7794f14fbbdded..19b1499b0281a1b92028cc8944c27ee4d56b8dd2 100644
--- a/proto/DataFormat.proto.m4
+++ b/proto/DataFormat.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 package paddle;
 
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto
similarity index 90%
rename from proto/ModelConfig.proto.m4
rename to proto/ModelConfig.proto
index c835cfd5221c8579b383c0a6f0b2f0f554eac6d2..552af71e76e5adf27f35bb5ad6fd8a69c71df0f1 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 import "ParameterConfig.proto";
 
@@ -19,7 +20,7 @@ package paddle;
 /**
  * Various structs for the configuration of a neural network
  */
-sinclude(`ModelConfigExt.proto.m4')
+
 
 message ExternalConfig {
   repeated string layer_names = 1;
@@ -76,6 +77,12 @@ message ConvConfig {
   required uint32 filter_size_y = 10;
   required uint32 padding_y = 11;
   required uint32 stride_y = 12;
+
+  // if not set, use output_x
+  optional uint32 output_y = 13;
+
+  // if not set, use img_size
+  optional uint32 img_size_y = 14;
 }
 
 message PoolConfig {
@@ -121,11 +128,9 @@ message PoolConfig {
 }
 
 message SppConfig {
-  required string pool_type = 1;
-  required uint32 pyramid_height = 2;
-  required uint32 channels = 3;
-  required uint32 img_size = 4;
-  optional uint32 img_size_y = 5;
+  required ImageConfig image_conf = 1;
+  required string pool_type = 2;
+  required uint32 pyramid_height = 3;
 }
 
 message NormConfig {
@@ -141,8 +146,8 @@ message NormConfig {
 
   // the parameters for normalization
   // u = u / (1+scale*sum(u^2 in window))^pow
-  required real scale = 4;
-  required real pow = 5;
+  required double scale = 4;
+  required double pow = 5;
 
   // The size of output feature map.
   required uint32 output_x = 6;
@@ -155,6 +160,12 @@ message NormConfig {
   // fixed window: shared a fixed window for each value
   // sliding window: have a different window for each value
   optional bool blocked = 8;
+
+  // if not set, use output_x
+  optional uint32 output_y = 9;
+
+  // if not set, use img_size
+  optional uint32 img_size_y = 10;
 }
 
 message BlockExpandConfig {
@@ -179,12 +190,8 @@ message BlockExpandConfig {
 }
 
 message MaxOutConfig {
-  required uint32 channels = 1;
+  required ImageConfig image_conf = 1;
   required uint32 groups = 2;
-
-  // The size of input feature map.
-  required uint32 img_size_x = 3;
-  required uint32 img_size_y = 4;
 }
 
 message ProjectionConfig {
@@ -216,7 +223,7 @@ message OperatorConfig {
   required uint64 output_size = 4;
 
   // For DotMulOperator
-  optional real dotmul_scale = 5 [default = 1.0];
+  optional double dotmul_scale = 5 [default = 1.0];
 
   // For ConvOperator
   optional ConvConfig conv_conf = 6;
@@ -225,12 +232,10 @@ message OperatorConfig {
 
 message BilinearInterpConfig {
   // The size of input feature map.
-  optional uint32 img_size_x = 1;
-  optional uint32 img_size_y = 2;
+  required ImageConfig image_conf = 1;
   // The size of output feature map.
-  required uint32 out_size_x = 3;
-  required uint32 out_size_y = 4;
-  required uint32 num_channels = 5;
+  required uint32 out_size_x = 2;
+  required uint32 out_size_y = 3;
 }
 
 message ImageConfig {
@@ -240,6 +245,7 @@ message ImageConfig {
 
   // The size of input feature map.
   required uint32 img_size = 8;
+  optional uint32 img_size_y = 9;
 }
 
 message LayerInputConfig {
@@ -260,7 +266,7 @@ message LayerInputConfig {
 }
 
 message LayerConfig {
-sinclude(`ModelConfigLayer.proto.m4')
+
   required string name = 1;
   required string type = 2;
   optional uint64 size = 3;
@@ -287,7 +293,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional uint32 partial_sum = 9;
 
   // for dropout
-  optional real drop_rate = 10;
+  optional double drop_rate = 10;
 
   // for HierarchicalSoftmaxLayer and NCELayer
   // the number of classes
@@ -311,17 +317,17 @@ sinclude(`ModelConfigLayer.proto.m4')
   // For NCELayer
   // The distribution for generating the random negative labels.
   // A uniform distribution will be used if not provided
-  repeated real neg_sampling_dist = 17 [packed = true];
+  repeated double neg_sampling_dist = 17 [packed = true];
 
   // For MaxLayer
   // default: output VALUE of MaxLayer. set this flag to true for output INDEX
-  // INDEX will be put in Argument::value as real values.
+  // INDEX will be put in Argument::value as double values.
   optional bool output_max_index = 19 [default = false];
 
   /// The filed number 20 have been deprecated.
 
   // For self-normalized estimation
-  optional real softmax_selfnorm_alpha = 21 [default = 0.1];
+  optional double softmax_selfnorm_alpha = 21 [default = 0.1];
 
   /// The filed numbers 22 and 23 have been deprecated.
 
@@ -332,14 +338,14 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional bool norm_by_times = 25;
 
   // for CostLayers
-  optional real coeff = 26 [default = 1.0];
+  optional double coeff = 26 [default = 1.0];
 
   // for AverageLayer
   // can be set to: 'average', 'sum' or 'squarerootn'
   optional string average_strategy = 27;
 
   // for error clipping
-  optional real error_clipping_threshold = 28 [default = 0.0];
+  optional double error_clipping_threshold = 28 [default = 0.0];
 
   // for operators used by mixed layer
   repeated OperatorConfig operator_confs = 29;
@@ -349,11 +355,11 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional int32 max_sort_size = 31;
 
   // for SlopeInterceptLayer
-  optional real slope = 32;
-  optional real intercept = 33;
+  optional double slope = 32;
+  optional double intercept = 33;
 
   // for CosSimVecMatLayer and CosSimLayer
-  optional real cos_scale = 34;
+  optional double cos_scale = 34;
 
   // for DataNormLayer
   // can be set to: 'z-score', 'min-max' or 'decimal-scaling'
@@ -388,7 +394,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   // if number of the selected columns is less than
   // sample number * selective_fc output size * selective_fc_mull_mull_ratio
   // sparse multiplication is used, otherwise, using full multiplication.
-  optional real selective_fc_full_mul_ratio = 44 [default = 0.02];
+  optional double selective_fc_full_mul_ratio = 44 [default = 0.02];
 
   // to indicate how many threads selective_fc use to to accelate
   // the plain_mul period
@@ -400,7 +406,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional bool use_global_stats = 46;
 
   // use to compute moving mean and variance.
-  optional real moving_average_fraction = 47 [default = 0.9];
+  optional double moving_average_fraction = 47 [default = 0.9];
 
   // bias size
   optional uint32 bias_size = 48 [default = 0];
@@ -412,7 +418,13 @@ sinclude(`ModelConfigLayer.proto.m4')
   // string type is used for flexibility: different types can be converted
   // to string and reinterpreted in the user's own layer implementation.  
   optional string user_arg = 49;
+  
+  // to indicate rectangle image data
+  optional uint64 height = 50;
+  optional uint64 width = 51;
 
+  // blank label used in ctc loss
+  optional uint32 blank = 52 [default = 0];
 }
 
 message EvaluatorConfig {
@@ -426,7 +438,7 @@ message EvaluatorConfig {
 
   // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
   // For multi binary labels: true if output > classification_threshold
-  optional real classification_threshold = 6 [default = 0.5];
+  optional double classification_threshold = 6 [default = 0.5];
   // The positive label. -1 means average precision and recall
   optional int32 positive_label = 7 [default = -1];
 
diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto
similarity index 85%
rename from proto/ParameterConfig.proto.m4
rename to proto/ParameterConfig.proto
index e8d512445e5025f5663fbe3e20b4425cf1633a2b..cbcd0af598df22c36c66767fdeb7add2aa49e87d 100644
--- a/proto/ParameterConfig.proto.m4
+++ b/proto/ParameterConfig.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 package paddle;
 
@@ -31,14 +32,14 @@ message ParameterUpdaterHookConfig {
 message ParameterConfig {
   required string name = 1;
   required uint64 size = 2;
-  optional real learning_rate = 3 [default = 1.0];
-  optional real momentum = 4 [default = 0.0];
-  optional real initial_mean = 5 [default = 0.0];
-  optional real initial_std = 6 [default = 0.01];
+  optional double learning_rate = 3 [default = 1.0];
+  optional double momentum = 4 [default = 0.0];
+  optional double initial_mean = 5 [default = 0.0];
+  optional double initial_std = 6 [default = 0.01];
   // use L2-regularization if decay_rate set and decay_rate_l1 not set
-  optional real decay_rate = 7 [default = 0.0];
+  optional double decay_rate = 7 [default = 0.0];
   // use L1-regularization if decay_rate_l1 set
-  optional real decay_rate_l1 = 8 [default = 0.0];
+  optional double decay_rate_l1 = 8 [default = 0.0];
   // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
   repeated uint64 dims = 9;
   // the gpu device which the parameter in.
@@ -59,7 +60,7 @@ message ParameterConfig {
   // sparse remote update or not
   optional bool sparse_remote_update = 16 [default = false];
   // gradient clipping threshold, no clipping by default
-  optional real gradient_clipping_threshold = 17 [default = 0.0];
+  optional double gradient_clipping_threshold = 17 [default = 0.0];
   // static parameters are fixed when training
   optional bool is_static = 18 [default = false];
   // para_id should NOT be set by config_parser. It is for
diff --git a/proto/ParameterService.proto.m4 b/proto/ParameterService.proto
similarity index 96%
rename from proto/ParameterService.proto.m4
rename to proto/ParameterService.proto
index 189dc1c9700bd821959bab80aef3721bd4940b5c..c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4 100644
--- a/proto/ParameterService.proto.m4
+++ b/proto/ParameterService.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 import "ParameterConfig.proto";
 import "TrainerConfig.proto";
@@ -20,7 +21,6 @@ package paddle;
 /**
  * Various structs for communicating with parameter server
  */
-
 enum ParameterUpdateMode {
   // Set parameter
    PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param
@@ -73,7 +73,7 @@ message SendParameterRequest {
   optional int64 num_samples = 4;
 
   // cost will be used to calculate global objective value
-  optional real cost = 5;
+  optional double cost = 5;
 
   required BatchStatus batch_status = 6;
 
@@ -245,13 +245,13 @@ enum MatrixVectorOperation {
 
 message ProtoVector {
   required int64 dim = 1;
-  repeated real values = 2 [packed = true];
+  repeated double values = 2 [packed = true];
 }
 
 message ProtoMatrix {
   required int64 num_rows = 1;
   required int64 num_cols = 2;
-  repeated real values = 3 [packed = true];
+  repeated double values = 3 [packed = true];
 }
 
 message Operation {
@@ -263,7 +263,7 @@ message Operation {
   // matrix handles created on the pserver
   repeated int64 pmatrices = 3;       // A, B, C
 
-  repeated real scalars = 4;  	      // a, b, c
+  repeated double scalars = 4;  	      // a, b, c
   repeated ProtoVector vectors = 5;   // x, y, z
   repeated ProtoMatrix matrices = 6;  // X, Y, Z
 }
@@ -272,7 +272,7 @@ message OperationResult {
   // error message. Empty if success
   optional string return_message = 1;
 //
-  repeated real scalars = 2;  // d, e, f
+  repeated double scalars = 2;  // d, e, f
   repeated ProtoVector vectors = 3;  // p, q, r
   repeated ProtoMatrix matrices = 4;  // P, Q, R
 }
diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto
similarity index 86%
rename from proto/TrainerConfig.proto.m4
rename to proto/TrainerConfig.proto
index 3b0e24f90bed8cdf0e102c12d2a4a041c17a8447..a334e07b6282a6ff9867482e0c3a299df2a78d1d 100644
--- a/proto/TrainerConfig.proto.m4
+++ b/proto/TrainerConfig.proto
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+syntax = "proto2";
 
 import "DataConfig.proto";
 import "ModelConfig.proto";
@@ -23,9 +24,9 @@ message OptimizationConfig {
   optional int32 num_batches_per_send_parameter = 5 [default = 1];
   optional int32 num_batches_per_get_parameter = 6 [default = 1];
 
-  required real learning_rate = 7;
-  optional real learning_rate_decay_a = 8 [default = 0];
-  optional real learning_rate_decay_b = 9 [default = 0];
+  required double learning_rate = 7;
+  optional double learning_rate_decay_a = 8 [default = 0];
+  optional double learning_rate_decay_b = 9 [default = 0];
   optional string learning_rate_schedule = 27 [default = "constant"];
   // learning rate will be scaled according to learning_rate_schedule
   // 1), constant:
@@ -48,14 +49,14 @@ message OptimizationConfig {
 
   // owlqn related
   // L1-regularization
-  optional real l1weight = 10 [default = 0.1];
+  optional double l1weight = 10 [default = 0.1];
   // L2-regularization
-  optional real l2weight = 11 [default = 0];
+  optional double l2weight = 11 [default = 0];
   // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
   // then accept the step
-  optional real c1 = 12 [default = 0.0001];
+  optional double c1 = 12 [default = 0.0001];
   // multiply the step with "backoff", when wolfe condition doesn't satisfy
-  optional real backoff = 13 [default = 0.5];
+  optional double backoff = 13 [default = 0.5];
   // how many "s"s and "y"s are kept in owlqn
   optional int32 owlqn_steps = 14 [default = 10];
   // accept the step if encountered "max_backoff" times of "reduce the step"
@@ -81,15 +82,15 @@ message OptimizationConfig {
   // default learning method("momentum") use global decayed learning rate with momentum.
   // "adagrad", "adadelta" and "rmsprop" can set momentum too.
   optional string learning_method = 23 [default = "momentum"];
-  optional real ada_epsilon = 24 [default = 1e-6];
-  optional real ada_rou = 26 [default = 0.95];
+  optional double ada_epsilon = 24 [default = 1e-6];
+  optional double ada_rou = 26 [default = 0.95];
 
   // Force to do average in cpu in order to save gpu memory usage
   optional bool do_average_in_cpu = 25 [default = false];
 
   // delta add rate in pserver, used while num_batches_per_send_parameter>1
   // will be divided by #machines automatically.
-  optional real delta_add_rate = 28 [default = 1.0];
+  optional double delta_add_rate = 28 [default = 1.0];
 
   // We split a large size into smaller mini-batches, whose sizes are
   // determined by mini_batch_size. It only takes effect when there is
@@ -107,14 +108,14 @@ message OptimizationConfig {
 
   // shrink sparse parameter value
   // only works if parameter is remote sparse update and has L1 decay rate
-  optional real shrink_parameter_value = 32 [default = 0];
+  optional double shrink_parameter_value = 32 [default = 0];
 
   ////////////////////////////
   // Options Adam Optimizer //
   ////////////////////////////
-  optional real adam_beta1 = 33 [default = 0.9];
-  optional real adam_beta2 = 34 [default = 0.999];
-  optional real adam_epsilon = 35 [default = 1e-8];
+  optional double adam_beta1 = 33 [default = 0.9];
+  optional double adam_beta2 = 34 [default = 0.999];
+  optional double adam_epsilon = 35 [default = 1e-8];
 
   // arguments for learning rate scheduler
   // Format: num1:rate1,num2:rate2,...,numK:rateK
@@ -126,7 +127,7 @@ message OptimizationConfig {
   // for async sgd gradient commit control.
   // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
   // current async gradient will be discard silently.
-  optional real async_lagged_grad_discard_ratio = 37 [default = 1.5];
+  optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
 };
 
 message TrainerConfig {
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/proto/__init__.py b/python/paddle/proto/__init__.py
index cd6a59ecbb0952e89f34b11678a60ad300585979..07406a841ec90a79fbe5d0aca7b19d19d85e008a 100644
--- a/python/paddle/proto/__init__.py
+++ b/python/paddle/proto/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 0c577ec657bc6d35c41e55ed5ab6adb80ab2c37c..6618153df30250652f1721d2fb0bb75ecbb8a04a 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -203,6 +203,26 @@ class CheckWrapper(object):
             callback(each)
 
 
+class CheckInputTypeWrapper(object):
+    def __init__(self, generator, input_types, logger):
+        self.generator = generator
+        self.input_types = input_types
+        self.logger = logger
+
+    def __call__(self, obj, filename):
+        for items in self.generator(obj, filename):
+            try:
+                # dict type is required for input_types when item is dict type 
+                assert (isinstance(items, dict) and \
+                        not isinstance(self.input_types, dict))==False
+                yield items
+            except AssertionError as e:
+                self.logger.error(
+                    "%s type is required for input type but got %s" %
+                    (repr(type(items)), repr(type(self.input_types))))
+                raise
+
+
 def provider(input_types=None,
              should_shuffle=None,
              pool_size=-1,
@@ -355,6 +375,9 @@ def provider(input_types=None,
                 if use_dynamic_order:
                     self.generator = InputOrderWrapper(self.generator,
                                                        self.input_order)
+                else:
+                    self.generator = CheckInputTypeWrapper(
+                        self.generator, self.slots, self.logger)
                 if self.check:
                     self.generator = CheckWrapper(self.generator, self.slots,
                                                   check_fail_continue,
diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py
index 90b684a000017fc03c8c33f829aaa64a5f769e45..6af250772859811b3c48434ab005e50b435dd320 100644
--- a/python/paddle/trainer/PyDataProviderWrapper.py
+++ b/python/paddle/trainer/PyDataProviderWrapper.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer/__init__.py b/python/paddle/trainer/__init__.py
index c90af2ee000d46a032984ee23559e7e99b49ddad..f662d6826321eb840739382558f76327d27b5847 100644
--- a/python/paddle/trainer/__init__.py
+++ b/python/paddle/trainer/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9db42bf172a77ff0972107dd26eed3882bf5906e..5b7f4d85e2c3343013938e38492be8985a8cd11f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -138,7 +138,14 @@ def init_config_environment(
         g_root_submodel=None,
         g_submodel_map={},
         g_submodel_stack=[],
-        g_add_submodel_suffix=False, ):
+        g_add_submodel_suffix=False,
+
+        # Whether current layer needs to pass the image height and width.
+        # Default value is true, but if it encounters recurrent_layer_group,
+        # it will be false. The reason is that image is converted to be sequence,
+        # image height will be sequence length, and image width will be feature
+        # length of each timestep.
+        g_pass_height_width=True, ):
 
     for k, v in locals().iteritems():
         globals()[k] = copy.deepcopy(v)
@@ -686,9 +693,9 @@ class ConvProjection(Projection):
 
         parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
                    num_filters)
-        # TODO: support rectangle input
-        self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x
-                                      **2) * num_filters
+        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
+                                     self.proj_conf.conv_conf.output_y * \
+                                     num_filters
 
     def calc_output_size(self, input_layer_config):
         return self.proj_conf.output_size
@@ -698,7 +705,8 @@ class ConvProjection(Projection):
         ci = self.proj_conf.conv_conf.channels
         fh = self.proj_conf.conv_conf.filter_size
         fw = self.proj_conf.conv_conf.filter_size_y
-        return co * ci * fh * fw
+        gr = self.proj_conf.conv_conf.groups
+        return co * ci * fh * fw / gr
 
     def calc_bias_size(self):
         return self.proj_conf.num_filters
@@ -763,8 +771,9 @@ class ConvOperator(Operator):
         parse_conv(conv_conf,
                    MakeLayerNameInSubmodel(input_layer_names[0]),
                    self.operator_conf.conv_conf, num_filters)
-        self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x
-                                          **2) * num_filters
+        self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
+                                         self.operator_conf.conv_conf.output_y * \
+                                         num_filters
 
         config_assert(len(input_layer_names) == 2, "Conv is binary operator")
 
@@ -799,14 +808,12 @@ class Conv(Cfg):
             config_assert(output_x <= 0)
 
 
-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class BilinearInterp(Cfg):
-    def __init__(self, out_size_x=None, out_size_y=None, num_channels=None):
+    def __init__(self, out_size_x=None, out_size_y=None, channels=None):
         self.add_keys(locals())
 
 
-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Pool(Cfg):
     def __init__(
@@ -824,14 +831,12 @@ class Pool(Cfg):
         self.add_keys(locals())
 
 
-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class SpatialPyramidPool(Cfg):
-    def __init__(self, pool_type, pyramid_height, channels, img_width=None):
+    def __init__(self, pool_type, pyramid_height, channels):
         self.add_keys(locals())
 
 
-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -846,7 +851,6 @@ class Norm(Cfg):
         self.add_keys(locals())
 
 
-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Image(Cfg):
     def __init__(self, channels, img_size=None):
@@ -1053,18 +1057,8 @@ def TestData(data_config, async_load_data=None):
         g_config.test_data_config.async_load_data = async_load_data
 
 
-def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
-    bilinear_conf.out_size_x = bilinear.out_size_x
-    bilinear_conf.out_size_y = bilinear.out_size_y
-    bilinear_conf.num_channels = bilinear.num_channels
-
-
-'''
-caffe_mode: compute the output size using floor instead of ceil,
-            which is consistent of caffe and CuDNN's convention.
-'''
-
-
+#caffe_mode: compute the output size using floor instead of ceil,
+#            which is consistent of caffe and CuDNN's convention.
 def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
     output = (2 * padding + img_size - filter_size) / float(stride)
     if caffe_mode:
@@ -1073,20 +1067,34 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
         return 1 + int(math.ceil(output))
 
 
-'''
-calcualte image_size based on output_size for convolution. 
-It is the reverse function of cnn_output_size
-'''
-
-
+#calcualte image_size based on output_size for de-convolution (ConvTransLayer).
+#It is the reverse function of cnn_output_size
 def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    if caffe_mode:
-        img_size = (output_size - 1) * stride + filter_size - 2 * padding
-    else:
-        img_size = (output_size - 2) * stride + filter_size - 2 * padding + 1
+    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+    if not caffe_mode:
+        img_size = img_size + 1
     return img_size
 
 
+def get_img_size(input_layer_name, channels):
+    input = g_layer_map[input_layer_name]
+    img_pixels = input.size / channels
+    img_size = input.width if input.width > 0 else int(img_pixels**0.5)
+    img_size_y = input.height if input.height > 0 else int(img_pixels /
+                                                           img_size)
+    config_assert(
+        img_size * img_size_y == img_pixels,
+        "Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
+        % (input_layer_name, img_size, img_size_y, img_pixels))
+    return img_size, img_size_y
+
+
+def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
+    parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
+    bilinear_conf.out_size_x = bilinear.out_size_x
+    bilinear_conf.out_size_y = bilinear.out_size_y
+
+
 def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
@@ -1102,14 +1110,8 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
     pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
 
-    img_pixels = g_layer_map[input_layer_name].size / pool.channels
-    # the img_width may be removed,
-    # and it can be calculated automatically later.
-    pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5))
-    pool_conf.img_size_y = img_pixels / pool_conf.img_size
-    config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (pool_conf.img_size, img_pixels))
+    pool_conf.img_size, pool_conf.img_size_y = \
+        get_img_size(input_layer_name, pool.channels)
 
     config_assert(not pool.start, "start is deprecated in pooling.")
 
@@ -1125,29 +1127,18 @@ def parse_pool(pool, input_layer_name, pool_conf):
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
+    parse_image(spp, input_layer_name, spp_conf.image_conf)
     spp_conf.pool_type = spp.pool_type
     config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
                   "pool-type %s is not in "
                   "['max-projection', 'avg-projection']" % spp.pool_type)
     spp_conf.pyramid_height = spp.pyramid_height
-    spp_conf.channels = spp.channels
-
-    img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels
-
-    spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5))
-    spp_conf.img_size_y = img_pixels / spp_conf.img_size
-    config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (spp_conf.img_size, img_pixels))
 
 
 def parse_image(image, input_layer_name, image_conf):
     image_conf.channels = image.channels
-    image_pixels = g_layer_map[input_layer_name].size / image_conf.channels
-    image_conf.img_size = int(image_pixels**0.5)
-    config_assert((image_conf.img_size**2) == image_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (image_conf.img_size, image_pixels))
+    image_conf.img_size, image_conf.img_size_y = \
+        get_img_size(input_layer_name, image_conf.channels)
 
 
 def parse_norm(norm, input_layer_name, norm_conf):
@@ -1161,24 +1152,18 @@ def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.pow = norm.pow
     norm_conf.blocked = norm.blocked
 
-    img_pixels = g_layer_map[input_layer_name].size / norm.channels
-    norm_conf.img_size = int(img_pixels**0.5)
-    config_assert((norm_conf.img_size**2) == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (norm_conf.img_size, img_pixels))
+    norm_conf.img_size, norm_conf.img_size_y = \
+        get_img_size(input_layer_name, norm.channels)
     norm_conf.output_x = norm_conf.img_size
+    norm_conf.output_y = norm_conf.img_size_y
     if norm.norm_type in ['cmrnorm-projection']:
         norm_conf.scale /= norm.size
     else:
         norm_conf.scale /= norm.size**2
 
 
-'''
-caffe_mode: compute the output size using floor instead of ceil,
-            which is consistent of caffe and CuDNN's convention.
-'''
-
-
+#caffe_mode: compute the output size using floor instead of ceil,
+#            which is consistent of caffe and CuDNN's convention.
 def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
     conv_conf.filter_size = conv.filter_size
     conv_conf.filter_size_y = conv.filter_size_y
@@ -1192,33 +1177,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
 
     if not trans:
         conv_conf.filter_channels = conv.channels / conv.groups
-
-        img_pixels = g_layer_map[input_layer_name].size / conv.channels
-        print('channels=%d size=%d' % (conv.channels,
-                                       g_layer_map[input_layer_name].size))
-        conv_conf.img_size = int(img_pixels**0.5)
-        config_assert((conv_conf.img_size**2) == img_pixels, (
-            "Input layer %s: Incorrect input image size %d for input " +
-            "image pixels %d") %
-                      (input_layer_name, conv_conf.img_size, img_pixels))
-
+        conv_conf.img_size, conv_conf.img_size_y = \
+            get_img_size(input_layer_name, conv.channels)
         conv_conf.output_x = cnn_output_size(
             conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
             conv_conf.stride, conv_conf.caffe_mode)
+        conv_conf.output_y = cnn_output_size(
+            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode)
     else:
         conv_conf.filter_channels = num_filters / conv.groups
-
-        outputSize = g_layer_map[input_layer_name].size / conv.channels
-        print('channels=%d size=%d' % (conv.channels,
-                                       g_layer_map[input_layer_name].size))
-        conv_conf.output_x = int(outputSize**0.5)
-        config_assert((conv_conf.output_x**2) == outputSize, (
-            "Input layer %s: Incorrect input image size %d for input " +
-            "image pixels %d") %
-                      (input_layer_name, conv_conf.output_x, outputSize))
+        conv_conf.output_x, conv_conf.output_y = \
+            get_img_size(input_layer_name, conv.channels)
         conv_conf.img_size = cnn_image_size(
             conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
             conv_conf.stride, conv_conf.caffe_mode)
+        conv_conf.img_size_y = cnn_image_size(
+            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode)
 
 
 def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
@@ -1247,10 +1223,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
 
 
 def parse_maxout(maxout, input_layer_name, maxout_conf):
-    maxout_conf.channels = maxout.channels
+    parse_image(maxout, input_layer_name, maxout_conf.image_conf)
     maxout_conf.groups = maxout.groups
-    maxout_conf.img_size_x = maxout.img_size_x
-    maxout_conf.img_size_y = maxout.img_size_y
 
 
 # Define an evaluator
@@ -1377,6 +1351,12 @@ class LayerBase(object):
 
         g_current_submodel.layer_names.append(self.config.name)
 
+        if self.config.type != 'data' and g_pass_height_width:
+            height = self.get_input_layer(0).height
+            width = self.get_input_layer(0).width
+            if height and width:
+                self.set_layer_height_width(height, width)
+
     def get_input_layer(self, input_index):
         return g_layer_map[self.config.inputs[input_index].input_layer_name]
 
@@ -1494,6 +1474,23 @@ class LayerBase(object):
                           'Different inputs result in' +
                           'different layer size at layer %s' % self.config.name)
 
+    def set_layer_height_width(self, height, width):
+        self.config.height = height
+        self.config.width = width
+
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      height,
+                      width,
+                      channels,
+                      is_print=True):
+        size = height * width * channels
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        if is_print:
+            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, height, width, size))
+
 
 @config_layer('multi_class_cross_entropy_with_selfnorm')
 class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
@@ -1583,9 +1580,11 @@ class PrintLayer(LayerBase):
 
 @config_layer('data')
 class DataLayer(LayerBase):
-    def __init__(self, name, size, device=None):
+    def __init__(self, name, size, height=None, width=None, device=None):
         super(DataLayer, self).__init__(
             name, 'data', size, inputs=[], device=device)
+        if height and width:
+            self.set_layer_height_width(height, width)
 
 
 '''
@@ -1684,14 +1683,13 @@ class ConvLayerBase(LayerBase):
 
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
-            parse_conv(self.inputs[input_index].conv, input_layer.name,
-                       self.config.inputs[input_index].conv_conf, num_filters)
             conv_conf = self.config.inputs[input_index].conv_conf
+            parse_conv(self.inputs[input_index].conv, input_layer.name,
+                       conv_conf, num_filters)
             psize = self.calc_parameter_size(conv_conf)
-            print("output size for %s is %d " % (name, conv_conf.output_x))
             self.create_input_parameter(input_index, psize)
-            self.set_layer_size(
-                (conv_conf.output_x**2) * self.config.num_filters)
+            self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x,
+                               self.config.num_filters)
 
         psize = self.config.size
         if shared_biases:
@@ -1778,10 +1776,11 @@ class NormLayer(LayerBase):
             name, 'norm', 0, inputs=inputs, device=device)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
-            parse_norm(self.inputs[input_index].norm, input_layer.name,
-                       self.config.inputs[input_index].norm_conf)
             norm_conf = self.config.inputs[input_index].norm_conf
-            self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels)
+            parse_norm(self.inputs[input_index].norm, input_layer.name,
+                       norm_conf)
+            self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
+                               norm_conf.channels, False)
 
 
 @config_layer('pool')
@@ -1791,13 +1790,11 @@ class PoolLayer(LayerBase):
             name, 'pool', 0, inputs=inputs, device=device)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
-            parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       self.config.inputs[input_index].pool_conf)
             pool_conf = self.config.inputs[input_index].pool_conf
-            print("output size for %s is %d*%d " % (name, pool_conf.output_y,
-                                                    pool_conf.output_x))
-            self.set_layer_size(
-                (pool_conf.output_x * pool_conf.output_y) * pool_conf.channels)
+            parse_pool(self.inputs[input_index].pool, input_layer.name,
+                       pool_conf)
+            self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
+                               pool_conf.channels)
 
 
 @config_layer('spp')
@@ -1807,12 +1804,10 @@ class SpatialPyramidPoolLayer(LayerBase):
             name, 'spp', 0, inputs=inputs, device=device)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
-            parse_spp(self.inputs[input_index].spp, input_layer.name,
-                      self.config.inputs[input_index].spp_conf)
             spp_conf = self.config.inputs[input_index].spp_conf
-            output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
-            print("output size for %s is %d " % (name, output_size))
-            self.set_layer_size(output_size * spp_conf.channels)
+            parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf)
+            output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
+            self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
 @config_layer('batch_norm')
@@ -1874,10 +1869,10 @@ class BatchNormLayer(LayerBase):
             self.config.moving_average_fraction = moving_average_fraction
 
         input_layer = self.get_input_layer(0)
-        parse_image(self.inputs[0].image, input_layer.name,
-                    self.config.inputs[0].image_conf)
         image_conf = self.config.inputs[0].image_conf
-        self.set_layer_size((image_conf.img_size**2) * image_conf.channels)
+        parse_image(self.inputs[0].image, input_layer.name, image_conf)
+        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                           image_conf.channels, False)
 
         psize = self.calc_parameter_size(image_conf)
         dims = [1, psize]
@@ -1935,11 +1930,11 @@ class MaxOutLayer(LayerBase):
         super(MaxOutLayer, self).__init__(
             name, 'maxout', 0, inputs=inputs, **xargs)
         input_layer = self.get_input_layer(0)
-        parse_maxout(self.inputs[0].maxout, input_layer.name,
-                     self.config.inputs[0].maxout_conf)
         maxout_conf = self.config.inputs[0].maxout_conf
-        self.set_layer_size(g_layer_map[input_layer.name].size /
-                            maxout_conf.groups)
+        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
+        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
+        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
+                           g_layer_map[input_layer.name].width, out_channels)
 
 
 # key: cost type
@@ -2519,11 +2514,10 @@ class BilinearInterpLayer(LayerBase):
         super(BilinearInterpLayer, self).__init__(
             name, 'bilinear_interp', 0, inputs=inputs, **xargs)
         input_layer = self.get_input_layer(0)
-        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name,
-                       self.config.inputs[0].bilinear_interp_conf)
-        conf = self.inputs[0].bilinear_interp
-        self.set_layer_size(conf.out_size_x * conf.out_size_y *
-                            conf.num_channels)
+        conf = self.config.inputs[0].bilinear_interp_conf
+        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
+        self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
+                           conf.image_conf.channels)
 
 
 @config_layer('sum_to_one_norm')
@@ -2993,9 +2987,32 @@ class CTCLayer(LayerBase):
         config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
 
 
+@config_layer('warp_ctc')
+class WarpCTCLayer(LayerBase):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 blank=0,
+                 norm_by_times=False,
+                 device=None):
+        super(WarpCTCLayer, self).__init__(
+            name, 'warp_ctc', size=size, inputs=inputs, device=device)
+        self.config.blank = blank
+        self.config.norm_by_times = norm_by_times
+        config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs')
+        input_layer = self.get_input_layer(0)
+        config_assert(
+            (input_layer.active_type == '' or
+             input_layer.active_type == 'linear'),
+            "Expecting the active_type of input layer to be linear or null")
+
+
 @config_layer('recurrent_layer_group')
 class RecurrentLayerGroup(LayerBase):
     def __init__(self, name, device=None):
+        global g_pass_height_width
+        g_pass_height_width = False
         super(RecurrentLayerGroup, self).__init__(
             name, 'recurrent_layer_group', 0, inputs=[], device=device)
 
@@ -3348,12 +3365,26 @@ def my_fatal(s):
     raise Exception()
 
 
+_parse_config_hooks = set()
+
+
+def register_parse_config_hook(f):
+    """
+    Register a hook function for parse_config. parse_config will invoke the hook
+    at the beginning of parse. This make it possible to reset global state for
+    for constructing the model.
+    """
+    _parse_config_hooks.add(f)
+
+
 def parse_config(config_file, config_arg_str):
     '''
     @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
     passed to config script as a dictionary CONFIG_ARGS
     '''
     init_config_environment()
+    for hook in _parse_config_hooks:
+        hook()
 
     config_args = {}
 
@@ -3381,7 +3412,21 @@ def parse_config(config_file, config_arg_str):
     g_root_submodel.is_recurrent_layer_group = False
     g_current_submodel = g_root_submodel
 
-    execfile(config_file, make_config_environment(config_file, config_args))
+    # for paddle on spark, need support non-file config.
+    # you can use parse_config like below:
+    #
+    # from paddle.trainer.config_parser import parse_config
+    # def configs():
+    #    #your paddle config code, which is same as config file.
+    #
+    # config = parse_config(configs, "is_predict=1")
+    # # then you get config proto object.
+    if hasattr(config_file, '__call__'):
+        config_file.func_globals.update(
+            make_config_environment("", config_args))
+        config_file()
+    else:
+        execfile(config_file, make_config_environment(config_file, config_args))
     for k, v in settings.iteritems():
         if v is None:
             continue
diff --git a/python/paddle/trainer/config_parser_extension.py b/python/paddle/trainer/config_parser_extension.py
index ba4c79efdc10ec6cc895e76ddb87bc3fbd19ddc1..b9e0f3eb13dd3f54e26a566f4ae937940134fa03 100644
--- a/python/paddle/trainer/config_parser_extension.py
+++ b/python/paddle/trainer/config_parser_extension.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
index a80ad13d1ed52d84c3b5882939271b91ecc07bb3..edca279dcadef42243cb3fc00366cec90cbc69bf 100644
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
index adebebba2523f851507c4a0525eeaae9cfeb9dcc..a2335768b92b66e3bfc0fb0d2562fb24ad291258 100644
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,4 +22,4 @@ from optimizers import *
 from attrs import *
 
 # This will enable operator overload for LayerOutput
-import math
+import math as layer_math
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index 6261934e1bc8e8df62aeaa0757f4a237f91ef748..06be3e45993bedc2ccf9874e1ab503a9fdbba623 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,8 @@ __all__ = [
     "TanhActivation", "SigmoidActivation", "SoftmaxActivation",
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
-    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"
+    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
+    "LogActivation"
 ]
 
 
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 54169f382f164e7b9cf061baeb21d4109a8ae5b6..59bb18bfcab30540bd38ca8d1cb300813d30fee8 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index b41097953dad8aa9c8755c25860b177cdbff5b93..d7cb95c477133823f9147e2085c9c609916f16e8 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ def define_py_data_source(file_list,
     """
     if isinstance(file_list, list):
         file_list_name = 'train.list'
-        if isinstance(cls, TestData):
+        if cls == TestData:
             file_list_name = 'test.list'
         with open(file_list_name, 'w') as f:
             f.writelines(file_list)
@@ -186,8 +186,7 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                                 obj="process", 
                                 args={"dictionary": dict_name})
 
-    The related data provider can refer to 
-    `here <../../data_provider/pydataprovider2.html#dataprovider-for-the-sequential-model>`__.
+    The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
 
     :param train_list: Train list name.
     :type train_list: basestring
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index c01050e338d5933f49f0504f2e9ef5f15c7743ba..ad3efcbf369411b9c42b2a32ed05b04f86bf7de6 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,6 +78,20 @@ class DefaultNameFactory(object):
         """
         pass
 
+    def reset(self):
+        self.__counter__ = 0
+
+
+_name_factories = []
+
+
+def reset_hook():
+    for factory in _name_factories:
+        factory.reset()
+
+
+register_parse_config_hook(reset_hook)
+
 
 def wrap_name_default(name_prefix=None):
     """
@@ -95,7 +109,9 @@ def wrap_name_default(name_prefix=None):
     :return: a decorator to set default name
     :rtype: callable
     """
-    return wrap_param_default(["name"], DefaultNameFactory(name_prefix))
+    factory = DefaultNameFactory(name_prefix)
+    _name_factories.append(factory)
+    return wrap_param_default(["name"], factory)
 
 
 def wrap_param_attr_default(param_names=None, default_factory=None):
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index dc6a36392f9c6bff42d3a37f963ed18a849414f5..3e0e88972c58e8c853e79e21f839943ae4b027d6 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -327,9 +327,10 @@ def ctc_error_evaluator(
 @wrap_name_default()
 def chunk_evaluator(
         input,
-        name=None,
-        chunk_scheme=None,
-        num_chunk_types=None, ):
+        label,
+        chunk_scheme,
+        num_chunk_types,
+        name=None, ):
     """
     Chunk evaluator is used to evaluate segment labelling accuracy for a
     sequence. It calculates the chunk detection F1 score.
@@ -363,22 +364,24 @@ def chunk_evaluator(
 
     .. code-block:: python
 
-       eval = chunk_evaluator(input)
+       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
     :param input: The input layers.
     :type input: LayerOutput
-    :param name: The Evaluator name, it is not necessary.
-    :type name: basename|None
+    :param label: An input layer containing the ground truth label.
+    :type label: LayerOutput
     :param chunk_scheme: The labelling schemes support 4 types. It is one of
-                         "IOB", "IOE", "IOBES", "plain".This Evaluator must
-                         contain this chunk_scheme.
+                         "IOB", "IOE", "IOBES", "plain". It is required.
     :type chunk_scheme: basestring
     :param num_chunk_types: number of chunk types other than "other"
+    :param name: The Evaluator name, it is optional.
+    :type name: basename|None
     """
     evaluator_base(
         name=name,
         type="chunk",
         input=input,
+        label=label,
         chunk_scheme=chunk_scheme,
         num_chunk_types=num_chunk_types)
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index d984e843204c1cd99ee5b8941dc056c091504869..c10fa671bdbf7c0a5e34c183533e04bea76029d4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -91,6 +91,7 @@ __all__ = [
     'linear_comb_layer',
     'convex_comb_layer',
     'ctc_layer',
+    'warp_ctc_layer',
     'crf_layer',
     'crf_decoding_layer',
     'nce_layer',
@@ -129,6 +130,9 @@ class LayerType(object):
     HSIGMOID = 'hsigmoid'
     CONV_LAYER = "conv"
     CONVTRANS_LAYER = "convt"
+    EXCONV_LAYER = "exconv"
+    EXCONVTRANS_LAYER = "exconvt"
+    CUDNNCONV_LAYER = "cudnn_conv"
     POOL_LAYER = "pool"
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
@@ -169,6 +173,7 @@ class LayerType(object):
     PRINT_LAYER = "print"
 
     CTC_LAYER = "ctc"
+    WARP_CTC_LAYER = "warp_ctc"
     CRF_LAYER = "crf"
     CRF_DECODING_LAYER = "crf_decoding"
     NCE_LAYER = 'nce'
@@ -763,7 +768,7 @@ def mixed_layer(size=0,
 
 
 @layer_support()
-def data_layer(name, size, layer_attr=None):
+def data_layer(name, size, height=None, width=None, layer_attr=None):
     """
     Define DataLayer For NeuralNetwork.
 
@@ -778,6 +783,10 @@ def data_layer(name, size, layer_attr=None):
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
+    :param height: Height of this data layer, used for image
+    :type size: int|None
+    :param width: Width of this data layer, used for image
+    :type size: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -787,6 +796,8 @@ def data_layer(name, size, layer_attr=None):
         type=LayerType.DATA,
         name=name,
         size=size,
+        height=height,
+        width=width,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
     return LayerOutput(name, LayerType.DATA, size=size)
@@ -959,7 +970,7 @@ def pooling_layer(input,
     :param layer_attr: The Extra Attributes for layer, such as dropout.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
-    :rtype: LayerType
+    :rtype: LayerOutput
     """
     extra_dict = dict()
     # noinspection PyUnresolvedReferences
@@ -1480,7 +1491,7 @@ def bilinear_interp_layer(input,
             bilinear_interp=BilinearInterp(
                 out_size_x=out_size_x,
                 out_size_y=out_size_y,
-                num_channels=num_channels)),
+                channels=num_channels)),
         type=LayerType.BILINEAR_INTERP_LAYER,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
@@ -1762,7 +1773,8 @@ def img_conv_layer(input,
                    filter_size_y=None,
                    stride_y=None,
                    padding_y=None,
-                   trans=False):
+                   trans=False,
+                   layer_type=None):
     """
     Convolution layer for image. Paddle only support square input currently and
     thus input image's width equals height.
@@ -1829,6 +1841,10 @@ def img_conv_layer(input,
     :type layer_attr: ExtraLayerAttribute
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
+    :param layer_type: specify the layer_type, default is None. If trans=True,
+                       layer_type has to be "exconvt", otherwise layer_type 
+                       has to be either "exconv" or "cudnn_conv"
+    :type layer_type: String
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1865,7 +1881,14 @@ def img_conv_layer(input,
         param_attr.attr["initial_strategy"] = 0
         param_attr.attr["initial_smart"] = False
 
-    lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
+    if layer_type:
+        if trans:
+            assert layer_type in ["exconvt"]
+        else:
+            assert layer_type in ["exconv", "cudnn_conv"]
+        lt = layer_type
+    else:
+        lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
 
     l = Layer(
         name=name,
@@ -1908,8 +1931,7 @@ def img_pool_layer(input,
                    layer_attr=None,
                    pool_size_y=None,
                    stride_y=None,
-                   padding_y=None,
-                   img_width=None):
+                   padding_y=None):
     """
     Image pooling Layer.
 
@@ -1940,9 +1962,6 @@ def img_pool_layer(input,
     :type stride_y: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
-    :param img_width: the width of input feature map. If it is None, the input feature
-                      map should be square.
-    :type img_width: int|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1978,8 +1997,7 @@ def img_pool_layer(input,
                     padding=padding,
                     size_y=pool_size_y,
                     stride_y=stride_y,
-                    padding_y=padding_y,
-                    img_width=img_width))
+                    padding_y=padding_y))
         ],
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
@@ -1997,7 +2015,6 @@ def spp_layer(input,
               num_channels=None,
               pool_type=None,
               pyramid_height=None,
-              img_width=None,
               layer_attr=None):
     """
     Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
@@ -2014,9 +2031,6 @@ def spp_layer(input,
     :type scale: BasePoolingType
     :param pyramid_height: pyramid height.
     :type pyramid_height: int
-    :param img_width: the width of input feature map. If it is None, the input feature
-                      map should be square.
-    :type img_width: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -2043,8 +2057,7 @@ def spp_layer(input,
             spp=SpatialPyramidPool(
                 pool_type=type_name,
                 channels=num_channels,
-                pyramid_height=pyramid_height,
-                img_width=img_width)),
+                pyramid_height=pyramid_height)),
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2853,11 +2866,11 @@ def recurrent_group(step,
     :type targetInlink: LayerOutput|SubsequenceInput
 
     :param is_generating: If is generating, none of input type should be LayerOutput;
-                          else, for training or testing, one of the input type must 
+                          else, for training or testing, one of the input type must
                           be LayerOutput.
 
     : type is_generating: bool
-    
+
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2905,15 +2918,16 @@ def recurrent_group(step,
         seq_reversed=reverse,
         target_inlinkname=targetInlinkName)
     in_args = []
-    has_LayerOutput = True
+    has_LayerOutput = False
     for each_input in input:
         assert is_single_input(each_input)
         if isinstance(each_input, LayerOutput):
             in_args.append(each_input)
+            has_LayerOutput = True
         elif isinstance(each_input, SubsequenceInput):
             in_args.append(each_input.input)
+            has_LayerOutput = True
         else:
-            has_LayerOutput = False
             mem_name = "__%s_memory__" % each_input.input.name
             mem = memory(
                 name=mem_name,
@@ -4084,6 +4098,83 @@ def ctc_layer(input,
     return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
 
 
+@wrap_name_default()
+@layer_support()
+def warp_ctc_layer(input,
+                   label,
+                   size=None,
+                   name=None,
+                   blank=0,
+                   norm_by_times=False,
+                   layer_attr=None):
+    """
+    A layer intergrating the open-source `warp-ctc
+    <https://github.com/baidu-research/warp-ctc>` library, which is used in
+    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
+    <https://arxiv.org/pdf/1512.02595v1.pdf>`, to compute Connectionist Temporal
+    Classification (CTC) loss.
+
+    More details of CTC can be found by referring to `Connectionist Temporal
+    Classification: Labelling Unsegmented Sequence Data with Recurrent
+    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
+    icml2006_GravesFGS06.pdf>`_
+
+    Note:
+        - Let num_classes represent the category number. Considering the 'blank'
+          label needed by CTC, you need to use (num_classes + 1) as the input
+          size. Thus, the size of both warp_ctc_layer and 'input' layer should
+          be set to num_classes + 1.
+        - You can set 'blank' to any value ranged in [0, num_classes], which
+          should be consistent as that used in your labels.
+        - As a native 'softmax' activation is interated to the warp-ctc library,
+         'linear' activation is expected instead in the 'input' layer.
+
+    The simple usage:
+
+    .. code-block:: python
+
+      ctc = warp_ctc_layer(input=input,
+                           label=label,
+                           size=1001,
+                           blank=1000,
+                           norm_by_times=False)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The data layer of label with variable length.
+    :type label: LayerOutput
+    :param size: category numbers + 1.
+    :type size: int
+    :param name: The name of this layer, which can not specify.
+    :type name: basestring|None
+    :param blank: the 'blank' label used in ctc
+    :type blank: int
+    :param norm_by_times: Whether to normalization by times. False by default.
+    :type norm_by_times: bool
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    if label.size is not None:
+        if size is not None:
+            assert size == label.size + 1
+        else:
+            size = label.size + 1
+    Layer(
+        name=name,
+        type=LayerType.WARP_CTC_LAYER,
+        size=size,
+        blank=blank,
+        norm_by_times=norm_by_times,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
+
+
 @wrap_name_default()
 @wrap_param_attr_default()
 @layer_support()
diff --git a/python/paddle/trainer_config_helpers/math.py b/python/paddle/trainer_config_helpers/math.py
index 30a9b1c4e895f2d7629c222208b79545b9c56fda..2d9e36f2b0d379d907634208a45c69efa9dbba3d 100644
--- a/python/paddle/trainer_config_helpers/math.py
+++ b/python/paddle/trainer_config_helpers/math.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index ff6d2e1cffcebf3b55ba7dfbefce9e2af6d09672..375bea34e8aa0ac2ea222531f313a627414495b0 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index 501fc3211bf2b955065f40109ab5ceeaa8041e6e..a53ebe160be3b5d6d115e3e15d059d3d87e80942 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -361,9 +361,6 @@ def settings(batch_size,
              learning_rate_decay_b=0.,
              learning_rate_schedule='poly',
              learning_rate_args='',
-             average_window=0,
-             do_average_in_cpu=False,
-             max_average_window=None,
              learning_method=None,
              regularization=None,
              is_async=False,
@@ -411,8 +408,7 @@ def settings(batch_size,
 
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'average_window', 'do_average_in_cpu', 'max_average_window'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
index 6f13a76f25a106dfda5e71dbc237b95afefff884..0c38a8dce553ec120cacc72edb604bfeb1819f93 100644
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index cf52b06bfea06d6a88b5f11558627936a74bd0b9..d1a9843d326669711bf3d0769df1b804cfcfa673 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -4,7 +4,22 @@ add_test(NAME layers_test
         python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-add_test(NAME test_layerHelpers
-  COMMAND
-  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
-)
+add_test(NAME test_reset_hook
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
+if (PROTOBUF_3)
+  add_paddle_exe(protobuf_equal
+    ProtobufEqualMain.cpp)
+  add_test(NAME test_layerHelpers
+    COMMAND
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
+  )
+else()
+  add_test(NAME test_layerHelpers
+    COMMAND
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+  )
+endif()
diff --git a/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp b/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc53422afd44a6cb1d81a8ec9d27d83523f30e6c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/util/message_differencer.h>
+#include <fstream>
+#include <iostream>
+#include "TrainerConfig.pb.h"
+
+bool loadPb(google::protobuf::Message* conf, const std::string& filename) {
+  std::ifstream fin;
+  fin.open(filename.c_str());
+  if (fin.is_open()) {
+    std::string str((std::istreambuf_iterator<char>(fin)),
+                    std::istreambuf_iterator<char>());
+    bool ok = google::protobuf::TextFormat::ParseFromString(str, conf);
+    fin.close();
+    return ok;
+  } else {
+    return false;
+  }
+}
+
+int main(int argc, char** argv) {
+  std::unique_ptr<google::protobuf::Message> config1;
+  std::unique_ptr<google::protobuf::Message> config2;
+  if (argc == 3) {
+    config1.reset(new paddle::ModelConfig());
+    config2.reset(new paddle::ModelConfig());
+  } else if (argc == 4) {
+    config1.reset(new paddle::TrainerConfig());
+    config2.reset(new paddle::TrainerConfig());
+  }
+  if (!config1 || !config2) {
+    return 1;
+  } else if (!loadPb(config1.get(), argv[1])) {
+    return 2;
+  } else if (!loadPb(config2.get(), argv[2])) {
+    return 3;
+  } else {
+    if (google::protobuf::util::MessageDifferencer::ApproximatelyEquals(
+            *config1, *config2)) {
+      return 0;
+    } else {
+      return 4;
+    }
+  }
+}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/.gitignore b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
index eb646b4a71ec1ac0e7992aabf2992fef7a9264a0..c654bd41b0b4dd0cb510943540b660b4e4a147d9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+++ b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
@@ -1 +1 @@
-protostr/*.unitest
+protostr/*.unittest
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3f1d99701afe5425553feb129c7619b5e3e689fa
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+export configs=(test_fc layer_activations projections test_print_layer
+test_sequence_pooling test_lstmemory_layer test_grumemory_layer
+last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
+img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
+test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
+test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
+
+export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index bb594ac2c245d8882569ba2c3cf00623a8fa8e2c..a54af94ce3db4ed300dee697b30516c3b6448d7c 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -5,24 +5,18 @@ cd `dirname $0`
 export PYTHONPATH=$PWD/../../../../
 
 protostr=$PWD/protostr
-
-configs=(test_fc layer_activations projections test_print_layer
-test_sequence_pooling test_lstmemory_layer test_grumemory_layer
-last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
-img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
-
-whole_configs=(test_split_datasource)
+. file_list.sh
 
 for conf in ${configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unitest
+    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unitest
+    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index c4c6d4020f59c9d39c0cfc1f075c16ac16ac33db..3331c10d6497f58eb135208bd7abe48aacfb10ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -1,15 +1,14 @@
 from paddle.trainer_config_helpers import *
-from paddle.trainer_config_helpers import math
 
 settings(batch_size=1000, learning_rate=1e-5)
 
 x = data_layer(name='data', size=100)
-x = math.exp(x)
-x = math.log(x)
-x = math.abs(x)
-x = math.sigmoid(x)
-x = math.square(x)
-x = math.square(x)
+x = layer_math.exp(x)
+x = layer_math.log(x)
+x = layer_math.abs(x)
+x = layer_math.sigmoid(x)
+x = layer_math.square(x)
+x = layer_math.square(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
index 1f262af21126c17eb133b92c84a1ae3bb280a1d6..1a577b8d9b1e1915236ba6afcfa97040d70c707a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -26,11 +26,15 @@ layers {
       filter_size_y: 32
       padding_y: 1
       stride_y: 1
+      output_y: 227
+      img_size_y: 256
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
   num_filters: 64
   shared_biases: true
+  height: 227
+  width: 227
 }
 layers {
   name: "__batch_norm_0__"
@@ -43,6 +47,7 @@ layers {
     image_conf {
       channels: 64
       img_size: 227
+      img_size_y: 227
     }
   }
   inputs {
@@ -55,6 +60,8 @@ layers {
   }
   bias_parameter_name: "___batch_norm_0__.wbias"
   moving_average_fraction: 0.9
+  height: 227
+  width: 227
 }
 layers {
   name: "__crmnorm_0__"
@@ -72,8 +79,12 @@ layers {
       output_x: 227
       img_size: 227
       blocked: false
+      output_y: 227
+      img_size_y: 227
     }
   }
+  height: 227
+  width: 227
 }
 layers {
   name: "__pool_0__"
@@ -97,6 +108,8 @@ layers {
       padding_y: 0
     }
   }
+  height: 196
+  width: 196
 }
 parameters {
   name: "___conv_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index 38346354080b02bebd937fd998fd3c63c8030346..cd310bd13b39aca57d7a1f38ac2a8966c706b60a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -26,6 +26,8 @@ layers {
       filter_size_y: 32
       padding_y: 1
       stride_y: 1
+      output_y: 227
+      img_size_y: 256
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -43,6 +45,7 @@ layers {
     image_conf {
       channels: 64
       img_size: 256
+      img_size_y: 256
     }
   }
   inputs {
@@ -55,6 +58,8 @@ layers {
   }
   bias_parameter_name: "___batch_norm_0__.wbias"
   moving_average_fraction: 0.9
+  height: 256
+  width: 256
 }
 layers {
   name: "__crmnorm_0__"
@@ -72,8 +77,12 @@ layers {
       output_x: 256
       img_size: 256
       blocked: false
+      output_y: 256
+      img_size_y: 256
     }
   }
+  height: 256
+  width: 256
 }
 layers {
   name: "__pool_0__"
@@ -97,6 +106,8 @@ layers {
       padding_y: 0
     }
   }
+  height: 225
+  width: 225
 }
 parameters {
   name: "___conv_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
index 2b3951c242411e0c0990a52bcb2ae6b1723a9367..2943ab130bd7d6f3b78ea611f1c35850ccaf5e92 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -177,6 +177,8 @@ layers {
       filter_size_y: 3
       padding_y: 0
       stride_y: 1
+      output_y: 30
+      img_size_y: 32
     }
     num_filters: 64
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
index 13d0d477eb58f6da887d0ad9c683caef37e00010..9fae596f281d44dc24c45cb3c750233266e95948 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -26,11 +26,15 @@ layers {
       filter_size_y: 3
       padding_y: 1
       stride_y: 1
+      output_y: 48
+      img_size_y: 48
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
   num_filters: 16
   shared_biases: true
+  height: 48
+  width: 48
 }
 layers {
   name: "__bilinear_interp_layer_0__"
@@ -40,11 +44,17 @@ layers {
   inputs {
     input_layer_name: "__conv_0__"
     bilinear_interp_conf {
+      image_conf {
+        channels: 16
+        img_size: 48
+        img_size_y: 48
+      }
       out_size_x: 64
       out_size_y: 64
-      num_channels: 16
     }
   }
+  height: 64
+  width: 64
 }
 layers {
   name: "__pool_0__"
@@ -55,19 +65,21 @@ layers {
     input_layer_name: "__bilinear_interp_layer_0__"
     pool_conf {
       pool_type: "max-projection"
-      channels: 4
+      channels: 16
       size_x: 2
       stride: 2
-      output_x: 64
-      img_size: 128
+      output_x: 32
+      img_size: 64
       padding: 0
       size_y: 2
       stride_y: 2
-      output_y: 64
-      img_size_y: 128
+      output_y: 32
+      img_size_y: 64
       padding_y: 0
     }
   }
+  height: 32
+  width: 32
 }
 layers {
   name: "__fc_layer_0__"
@@ -78,6 +90,8 @@ layers {
     input_layer_name: "__pool_0__"
     input_parameter_name: "___fc_layer_0__.w0"
   }
+  height: 32
+  width: 32
 }
 parameters {
   name: "___conv_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index f6045fe1f68255daf0d9b5ab05034eec633e4503..10e59e21bc7a48bc53fb535f86f053c91f57c1df 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -47,6 +47,20 @@ layers {
   }
   norm_by_times: false
 }
+layers {
+  name: "__warp_ctc_layer_0__"
+  type: "warp_ctc"
+  size: 5001
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  norm_by_times: false
+  blank: 0
+}
 layers {
   name: "crf_label"
   type: "data"
@@ -244,6 +258,7 @@ input_layer_names: "xe-label"
 input_layer_names: "huber_probs"
 input_layer_names: "huber_label"
 output_layer_names: "__ctc_layer_0__"
+output_layer_names: "__warp_ctc_layer_0__"
 output_layer_names: "__crf_layer_0__"
 output_layer_names: "__rank_cost_0__"
 output_layer_names: "__lambda_cost_0__"
@@ -260,6 +275,7 @@ sub_models {
   layer_names: "xe-label"
   layer_names: "__fc_layer_0__"
   layer_names: "__ctc_layer_0__"
+  layer_names: "__warp_ctc_layer_0__"
   layer_names: "crf_label"
   layer_names: "__crf_layer_0__"
   layer_names: "left"
@@ -289,6 +305,7 @@ sub_models {
   input_layer_names: "huber_probs"
   input_layer_names: "huber_label"
   output_layer_names: "__ctc_layer_0__"
+  output_layer_names: "__warp_ctc_layer_0__"
   output_layer_names: "__crf_layer_0__"
   output_layer_names: "__rank_cost_0__"
   output_layer_names: "__lambda_cost_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
index 1be2a7ceebfb74d677ac056dcc3a9f72fd31ccd6..c763a95f9d1aefa022f38e0beef6d1c86ebb360d 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -4,6 +4,8 @@ layers {
   type: "data"
   size: 2304
   active_type: ""
+  height: 48
+  width: 48
 }
 layers {
   name: "__conv_0__"
@@ -26,11 +28,15 @@ layers {
       filter_size_y: 3
       padding_y: 1
       stride_y: 1
+      output_y: 48
+      img_size_y: 48
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
   num_filters: 16
   shared_biases: true
+  height: 48
+  width: 48
 }
 layers {
   name: "__maxout_layer_0__"
@@ -40,12 +46,16 @@ layers {
   inputs {
     input_layer_name: "__conv_0__"
     maxout_conf {
-      channels: 16
+      image_conf {
+        channels: 16
+        img_size: 48
+        img_size_y: 48
+      }
       groups: 2
-      img_size_x: 0
-      img_size_y: 0
     }
   }
+  height: 48
+  width: 48
 }
 layers {
   name: "__pool_0__"
@@ -69,48 +79,58 @@ layers {
       padding_y: 0
     }
   }
+  height: 24
+  width: 24
 }
 layers {
   name: "__conv_1__"
   type: "exconv"
-  size: 18432
+  size: 73728
   active_type: ""
   inputs {
     input_layer_name: "__pool_0__"
     input_parameter_name: "___conv_1__.w0"
     conv_conf {
       filter_size: 3
-      channels: 32
+      channels: 8
       stride: 1
       padding: 1
       groups: 1
-      filter_channels: 32
-      output_x: 12
-      img_size: 12
+      filter_channels: 8
+      output_x: 24
+      img_size: 24
       caffe_mode: true
       filter_size_y: 3
       padding_y: 1
       stride_y: 1
+      output_y: 24
+      img_size_y: 24
     }
   }
   bias_parameter_name: "___conv_1__.wbias"
   num_filters: 128
   shared_biases: true
+  height: 24
+  width: 24
 }
 layers {
   name: "__maxout_layer_1__"
   type: "maxout"
-  size: 9216
+  size: 18432
   active_type: ""
   inputs {
-    input_layer_name: "__conv_0__"
+    input_layer_name: "__conv_1__"
     maxout_conf {
-      channels: 128
+      image_conf {
+        channels: 128
+        img_size: 24
+        img_size_y: 24
+      }
       groups: 4
-      img_size_x: 0
-      img_size_y: 0
     }
   }
+  height: 24
+  width: 24
 }
 layers {
   name: "__block_expand_layer_0__"
@@ -118,7 +138,7 @@ layers {
   size: 192
   active_type: ""
   inputs {
-    input_layer_name: "__maxout_layer_0__"
+    input_layer_name: "__maxout_layer_1__"
     block_expand_conf {
       channels: 32
       stride_x: 1
@@ -133,6 +153,8 @@ layers {
       img_size_y: 0
     }
   }
+  height: 24
+  width: 24
 }
 layers {
   name: "__fc_layer_0__"
@@ -143,6 +165,8 @@ layers {
     input_layer_name: "__block_expand_layer_0__"
     input_parameter_name: "___fc_layer_0__.w0"
   }
+  height: 24
+  width: 24
 }
 parameters {
   name: "___conv_0__.w0"
@@ -164,9 +188,9 @@ parameters {
 }
 parameters {
   name: "___conv_1__.w0"
-  size: 36864
+  size: 9216
   initial_mean: 0.0
-  initial_std: 0.0833333333333
+  initial_std: 0.166666666667
   initial_strategy: 0
   initial_smart: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
index 8b0a8f2146b709ee67981049da8061597e1716be..ca1b2d8cffd6b472dfe40feeeb762e169bc853c7 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
@@ -4,6 +4,8 @@ layers {
   type: "data"
   size: 3200
   active_type: ""
+  height: 20
+  width: 10
 }
 layers {
   name: "__spp_0__"
@@ -13,13 +15,17 @@ layers {
   inputs {
     input_layer_name: "data"
     spp_conf {
+      image_conf {
+        channels: 16
+        img_size: 10
+        img_size_y: 20
+      }
       pool_type: "max-projection"
       pyramid_height: 2
-      channels: 16
-      img_size: 10
-      img_size_y: 20
     }
   }
+  height: 1
+  width: 5
 }
 input_layer_names: "data"
 output_layer_names: "__spp_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index 968328835842667470467830c7ca59e4b9fa723d..e984ee70625456241b3cfe6202fdadaa3807d33c 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -5,13 +5,38 @@ set -e
 
 protostr=`dirname $0`/protostr
 
-files=`ls $protostr | grep -v "unitest"`
+files=`ls $protostr | grep -v "unittest"`
 
 ./generate_protostr.sh
 
-for file in $files
-do
-    base_protostr=$protostr/$file
-    new_protostr=$protostr/$file.unitest
-    diff $base_protostr $new_protostr -u
-done
+. ./file_list.sh
+
+if [ -z $1 ]; then
+  for file in $files
+  do
+      base_protostr=$protostr/$file
+      new_protostr=$protostr/$file.unittest
+      diff $base_protostr $new_protostr -u
+      diff $protostr/$file $protostr/$file.non_file_config.unittest -u
+  done
+else
+  for file in ${configs[*]}
+  do
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
+    fi
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
+    fi
+  done
+
+  for file in ${whole_configs[*]}
+  do
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
+    fi
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
+    fi
+  done
+fi
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
index e15a55b412f9459ecd89a0f654256097099c1398..be83f4f83c5d05ea2ffd9e3df0c09fb1a37a3e57 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
@@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)
 
 pool = img_pool_layer(
     input=bilinear,
-    num_channels=4,
+    num_channels=16,
     pool_size=2,
     stride=2,
     pool_type=MaxPooling())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b791a0222dd60e9ae2fca8b2798cddd13ed1d1c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+import getopt
+
+
+def main(print_whole_config, globals, locals):
+    '''
+     this test will all test_config.py
+  '''
+    cmdstr = """from paddle.trainer.config_parser import parse_config\n"""
+    importstr = ""
+    functionstr = ""
+
+    for line in sys.stdin:
+        if re.match("^import", line) or re.match("^from.*import", line):
+            importstr = importstr + line
+        else:
+            functionstr = functionstr + "  " + line
+
+    cmdstr = cmdstr + importstr + """def configs():\n""" + functionstr
+    #cmdstr = cmdstr + """def configs():\n""" + importstr + functionstr
+    if print_whole_config:
+        cmdstr = cmdstr + """print parse_config(configs, "")"""
+    else:
+        cmdstr = cmdstr + """print parse_config(configs, "").model_config"""
+
+    exec (cmdstr, globals, locals)
+
+
+if __name__ == '__main__':
+    whole = False
+    opts, args = getopt.getopt(sys.argv[1:], "", ["whole"])
+    for op, value in opts:
+        if op == "--whole":
+            whole = True
+    main(whole, globals(), locals())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index fd979a1e9f4337417512b4d6581c34e54c3957bd..18ff6b48c495b7a9d61595916ade1a54b1fa6a10 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4)
 outputs(
     ctc_layer(
         input=seq_in, label=labels),
+    warp_ctc_layer(
+        input=seq_in, label=labels, blank=0),
     crf_layer(
         input=hidden, label=data_layer(
             name='crf_label', size=4)),
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
index 081430d716093877db6b2e44ac5417c37ede9a6e..eb14270baa0c4ca0b84d2121a80fde0b45eda54a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
 
-data = data_layer(name='data', size=2304)
+data = data_layer(name='data', size=2304, height=48, width=48)
 
 conv = img_conv_layer(
     input=data,
@@ -21,16 +21,21 @@ pool = img_pool_layer(
 conv2 = img_conv_layer(
     input=pool,
     filter_size=3,
-    num_channels=32,
+    num_channels=8,
     num_filters=128,
     padding=1,
     act=LinearActivation(),
     bias_attr=True)
 
-maxout2 = maxout_layer(input=conv, num_channels=128, groups=4)
+maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)
 
 block = block_expand_layer(
-    input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6)
+    input=maxout2,
+    num_channels=32,
+    stride_x=1,
+    stride_y=1,
+    block_x=1,
+    block_y=6)
 
 fc = fc_layer(input=block, size=384, bias_attr=False)
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
index e20ffb584e8bdd86100455d4e55fe633b878e034..e0b0d0d3be252700d99f7097f0353df885efcf07 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
@@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import *
 
 settings(batch_size=100, learning_rate=1e-5)
 
-data = data_layer(name='data', size=3200)
+data = data_layer(name='data', size=3200, height=20, width=10)
 
 spp = spp_layer(
-    input=data,
-    pyramid_height=2,
-    num_channels=16,
-    pool_type=MaxPooling(),
-    img_width=10)
+    input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())
 
 outputs(spp)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
index 3b55667354750066a7d3ab3a0af59eb9e7d47d86..05902ea293df5a3e9c10f6700930ca6a343603c2 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
index 44d134d1f7d5fb5de790cf564f4c1e0899571473..ae275735aa2b852b3b226a4a0e5b2d4d000ba199 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..0423babdb720191d8e9dfc67f1af3be339dbe27d
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
@@ -0,0 +1,28 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from paddle.trainer.config_parser import parse_config
+
+
+class TestParse(unittest.TestCase):
+    def test_parse(self):
+        a = parse_config('trainer_config_helpers/tests/layers_test_config.py',
+                         '')
+        b = parse_config('trainer_config_helpers/tests/layers_test_config.py',
+                         '')
+        self.assertEqual(a, b)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/trainer_config_helpers/utils.py b/python/paddle/trainer_config_helpers/utils.py
index c0235b28cdfb96fbff9c02c217ffd972e4f8816e..fe6e9cd53cc821d2b6dbdabb7130567e22f8000f 100644
--- a/python/paddle/trainer_config_helpers/utils.py
+++ b/python/paddle/trainer_config_helpers/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 3e93f41c2e32025b3e29a0990833d7e97a7c8caa..15595d208583b567b8f768c8d7bd84986ca5a03f 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
index c5ce5c8d9a084d68b250d091808f528459f46921..73bf349c46726163d664c374aa47598871b90106 100644
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ce32f7811d6be6864a567cf41bf408f422409a7
--- /dev/null
+++ b/python/paddle/utils/image_multiproc.py
@@ -0,0 +1,262 @@
+import os, sys
+import numpy as np
+from PIL import Image
+from cStringIO import StringIO
+import multiprocessing
+import functools
+import itertools
+
+from paddle.utils.image_util import *
+from paddle.trainer.config_parser import logger
+
+try:
+    import cv2
+except ImportError:
+    logger.warning("OpenCV2 is not installed, using PIL to prcoess")
+    cv2 = None
+
+__all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]
+
+
+class CvTransformer(ImageTransformer):
+    """
+    CvTransformer used python-opencv to process image.
+    """
+
+    def __init__(
+            self,
+            min_size=None,
+            crop_size=None,
+            transpose=(2, 0, 1),  # transpose to C * H * W
+            channel_swap=None,
+            mean=None,
+            is_train=True,
+            is_color=True):
+        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
+        self.min_size = min_size
+        self.crop_size = crop_size
+        self.is_train = is_train
+
+    def resize(self, im, min_size):
+        row, col = im.shape[:2]
+        new_row, new_col = min_size, min_size
+        if row > col:
+            new_row = min_size * row / col
+        else:
+            new_col = min_size * col / row
+        im = cv2.resize(im, (new_row, new_col), interpolation=cv2.INTER_CUBIC)
+        return im
+
+    def crop_and_flip(self, im):
+        """
+        Return cropped image.
+        The size of the cropped image is inner_size * inner_size.
+        im: (H x W x K) ndarrays
+        """
+        row, col = im.shape[:2]
+        start_h, start_w = 0, 0
+        if self.is_train:
+            start_h = np.random.randint(0, row - self.crop_size + 1)
+            start_w = np.random.randint(0, col - self.crop_size + 1)
+        else:
+            start_h = (row - self.crop_size) / 2
+            start_w = (col - self.crop_size) / 2
+        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
+        if self.is_color:
+            im = im[start_h:end_h, start_w:end_w, :]
+        else:
+            im = im[start_h:end_h, start_w:end_w]
+        if (self.is_train) and (np.random.randint(2) == 0):
+            if self.is_color:
+                im = im[:, ::-1, :]
+            else:
+                im = im[:, ::-1]
+        return im
+
+    def transform(self, im):
+        im = self.resize(im, self.min_size)
+        im = self.crop_and_flip(im)
+        # transpose, swap channel, sub mean
+        im = im.astype('float32')
+        ImageTransformer.transformer(self, im)
+        return im
+
+    def load_image_from_string(self, data):
+        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
+        im = cv2.imdecode(np.fromstring(data, np.uint8), flag)
+        return im
+
+    def transform_from_string(self, data):
+        im = self.load_image_from_string(data)
+        return self.transform(im)
+
+    def load_image_from_file(self, file):
+        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
+        im = cv2.imread(file, flag)
+        return im
+
+    def transform_from_file(self, file):
+        im = self.load_image_from_file(file)
+        return self.transform(im)
+
+
+class PILTransformer(ImageTransformer):
+    """
+    PILTransformer used PIL to process image.
+    """
+
+    def __init__(
+            self,
+            min_size=None,
+            crop_size=None,
+            transpose=(2, 0, 1),  # transpose to C * H * W
+            channel_swap=None,
+            mean=None,
+            is_train=True,
+            is_color=True):
+        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
+        self.min_size = min_size
+        self.crop_size = crop_size
+        self.is_train = is_train
+
+    def resize(self, im, min_size):
+        row, col = im.size[:2]
+        new_row, new_col = min_size, min_size
+        if row > col:
+            new_row = min_size * row / col
+        else:
+            new_col = min_size * col / row
+        im = im.resize((new_row, new_col), Image.ANTIALIAS)
+        return im
+
+    def crop_and_flip(self, im):
+        """
+        Return cropped image.
+        The size of the cropped image is inner_size * inner_size.
+        """
+        row, col = im.size[:2]
+        start_h, start_w = 0, 0
+        if self.is_train:
+            start_h = np.random.randint(0, row - self.crop_size + 1)
+            start_w = np.random.randint(0, col - self.crop_size + 1)
+        else:
+            start_h = (row - self.crop_size) / 2
+            start_w = (col - self.crop_size) / 2
+        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
+        im = im.crop((start_h, start_w, end_h, end_w))
+        if (self.is_train) and (np.random.randint(2) == 0):
+            im = im.transpose(Image.FLIP_LEFT_RIGHT)
+        return im
+
+    def transform(self, im):
+        im = self.resize(im, self.min_size)
+        im = self.crop_and_flip(im)
+        im = np.array(im, dtype=np.float32)  # convert to numpy.array
+        # transpose, swap channel, sub mean
+        ImageTransformer.transformer(self, im)
+        return im
+
+    def load_image_from_string(self, data):
+        im = Image.open(StringIO(data))
+        return im
+
+    def transform_from_string(self, data):
+        im = self.load_image_from_string(data)
+        return self.transform(im)
+
+    def load_image_from_file(self, file):
+        im = Image.open(file)
+        return im
+
+    def transform_from_file(self, file):
+        im = self.load_image_from_file(file)
+        return self.transform(im)
+
+
+def job(is_img_string, transformer, (data, label)):
+    if is_img_string:
+        return transformer.transform_from_string(data), label
+    else:
+        return transformer.transform_from_file(data), label
+
+
+class MultiProcessImageTransformer(object):
+    def __init__(self,
+                 procnum=10,
+                 resize_size=None,
+                 crop_size=None,
+                 transpose=(2, 0, 1),
+                 channel_swap=None,
+                 mean=None,
+                 is_train=True,
+                 is_color=True,
+                 is_img_string=True):
+        """
+        Processing image with multi-process. If it is used in PyDataProvider,
+        the simple usage for CNN is as follows:
+       
+        .. code-block:: python
+
+            def hool(settings, is_train,  **kwargs):
+                settings.is_train = is_train
+                settings.mean_value = np.array([103.939,116.779,123.68], dtype=np.float32)
+                settings.input_types = [
+                    dense_vector(3 * 224 * 224),
+                    integer_value(1)]
+                settings.transformer = MultiProcessImageTransformer(
+                    procnum=10,
+                    resize_size=256,
+                    crop_size=224,
+                    transpose=(2, 0, 1),
+                    mean=settings.mean_values,
+                    is_train=settings.is_train)
+
+
+            @provider(init_hook=hook, pool_size=20480)
+            def process(settings, file_list):
+                with open(file_list, 'r') as fdata:
+                    for line in fdata: 
+                        data_dic = np.load(line.strip()) # load the data batch pickled by Pickle.
+                        data = data_dic['data']
+                        labels = data_dic['label']
+                        labels = np.array(labels, dtype=np.float32)
+                        for im, lab in settings.dp.run(data, labels):
+                            yield [im.astype('float32'), int(lab)]
+
+        :param procnum: processor number.
+        :type procnum: int
+        :param resize_size: the shorter edge size of image after resizing.
+        :type resize_size: int
+        :param crop_size: the croping size.
+        :type crop_size: int
+        :param transpose: the transpose order, Paddle only allow C * H * W order.
+        :type transpose: tuple or list
+        :param channel_swap: the channel swap order, RGB or BRG.
+        :type channel_swap: tuple or list
+        :param mean: the mean values of image, per-channel mean or element-wise mean.
+        :type mean: array, The dimension is 1 for per-channel mean.
+                    The dimension is 3 for element-wise mean. 
+        :param is_train: training peroid or testing peroid.
+        :type is_train: bool.
+        :param is_color: the image is color or gray. 
+        :type is_color: bool.
+        :param is_img_string: The input can be the file name of image or image string.
+        :type is_img_string: bool.
+        """
+
+        self.procnum = procnum
+        self.pool = multiprocessing.Pool(procnum)
+        self.is_img_string = is_img_string
+        if cv2 is not None:
+            self.transformer = CvTransformer(resize_size, crop_size, transpose,
+                                             channel_swap, mean, is_train,
+                                             is_color)
+        else:
+            self.transformer = PILTransformer(resize_size, crop_size, transpose,
+                                              channel_swap, mean, is_train,
+                                              is_color)
+
+    def run(self, data, label):
+        fun = functools.partial(job, self.is_img_string, self.transformer)
+        return self.pool.imap_unordered(
+            fun, itertools.izip(data, label), chunksize=100 * self.procnum)
diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py
index b5c6431c06f77cef5c31ca844a8427eebaea2fce..d3d79b14405256bbc95c41d805dbee56cb104f5e 100644
--- a/python/paddle/utils/image_util.py
+++ b/python/paddle/utils/image_util.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -186,29 +186,32 @@ class ImageTransformer:
                  channel_swap=None,
                  mean=None,
                  is_color=True):
-        self.transpose = transpose
-        self.channel_swap = None
-        self.mean = None
         self.is_color = is_color
+        self.set_transpose(transpose)
+        self.set_channel_swap(channel_swap)
+        self.set_mean(mean)
 
     def set_transpose(self, order):
-        if self.is_color:
-            assert 3 == len(order)
+        if order is not None:
+            if self.is_color:
+                assert 3 == len(order)
         self.transpose = order
 
     def set_channel_swap(self, order):
-        if self.is_color:
-            assert 3 == len(order)
+        if order is not None:
+            if self.is_color:
+                assert 3 == len(order)
         self.channel_swap = order
 
     def set_mean(self, mean):
-        # mean value, may be one value per channel 
-        if mean.ndim == 1:
-            mean = mean[:, np.newaxis, np.newaxis]
-        else:
-            # elementwise mean
-            if self.is_color:
-                assert len(mean.shape) == 3
+        if mean is not None:
+            # mean value, may be one value per channel 
+            if mean.ndim == 1:
+                mean = mean[:, np.newaxis, np.newaxis]
+            else:
+                # elementwise mean
+                if self.is_color:
+                    assert len(mean.shape) == 3
         self.mean = mean
 
     def transformer(self, data):
diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py
index 29e271717d7108f343d4c28d51c7dfb11bb33fba..1370ea83a49955d3152a1147f8e8108371a8ae12 100644
--- a/python/paddle/utils/make_model_diagram.py
+++ b/python/paddle/utils/make_model_diagram.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py
index 7bc7c5f8d243ed4cca834c48197c511e44baf215..27bd8157d39632913e2fa3278f3af20ddea61da7 100644
--- a/python/paddle/utils/plotcurve.py
+++ b/python/paddle/utils/plotcurve.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/predefined_net.py b/python/paddle/utils/predefined_net.py
index e9033432ed5200a88ce1ce4f3d7e74d03cf8c8e5..fa05f981f2b66bf55303a6f7c332c0bc9b112d29 100644
--- a/python/paddle/utils/predefined_net.py
+++ b/python/paddle/utils/predefined_net.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py
index f3c609e4cd1a3714219965cd543ab11136d3585f..975f1e9edea161331d37afbc6b5af46286f185bf 100644
--- a/python/paddle/utils/preprocess_img.py
+++ b/python/paddle/utils/preprocess_img.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py
index e5067a80ea7005bee7781e885b3658a2c03dc6f2..1d17a488243eb81e46bea3ead686efd021499e22 100644
--- a/python/paddle/utils/preprocess_util.py
+++ b/python/paddle/utils/preprocess_util.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/show_pb.py b/python/paddle/utils/show_pb.py
index 3b371727b84e51f9f7db80b34e6e38fd149fcaaa..20614826d1d01f50a2bb54a840e2c584fb93b247 100644
--- a/python/paddle/utils/show_pb.py
+++ b/python/paddle/utils/show_pb.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/utils/torch2paddle.py b/python/paddle/utils/torch2paddle.py
index 958f55dbc4ee0b588b78fb630153b585f1ad4be0..91490111a1144ae25ed6566ff1c83db4f7954d33 100644
--- a/python/paddle/utils/torch2paddle.py
+++ b/python/paddle/utils/torch2paddle.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/third_party/gflags.BUILD b/third_party/gflags.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..85e8bd0bd74942102e5e9a9f817dc49383a745e7
--- /dev/null
+++ b/third_party/gflags.BUILD
@@ -0,0 +1,12 @@
+# Bazel (http://bazel.io/) BUILD file for gflags.
+#
+# See INSTALL.md for instructions for adding gflags to a Bazel workspace.
+
+licenses(["notice"])
+
+exports_files(["src/gflags_complections.sh", "COPYING.txt"])
+
+load(":bazel/gflags.bzl", "gflags_sources", "gflags_library")
+(hdrs, srcs) = gflags_sources(namespace=["google", "gflags"])
+gflags_library(hdrs=hdrs, srcs=srcs, threads=0)
+gflags_library(hdrs=hdrs, srcs=srcs, threads=1)
diff --git a/third_party/gflags_test/BUILD b/third_party/gflags_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b50615203ba17c74a4c7611b685f3d3210389bbf
--- /dev/null
+++ b/third_party/gflags_test/BUILD
@@ -0,0 +1,10 @@
+licenses(["notice"])  # Apache 2.0
+
+cc_test(
+    name="gflags_test",
+    srcs=["gflags_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        "@gflags//:gflags",
+    ], )
diff --git a/third_party/gflags_test/gflags_test.cc b/third_party/gflags_test/gflags_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53286e7e5be062cf66b37d07047b173ea831e6c4
--- /dev/null
+++ b/third_party/gflags_test/gflags_test.cc
@@ -0,0 +1,33 @@
+#include <iostream>
+#include <string>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DEFINE_bool(verbose, false, "Display program name before message");
+DEFINE_string(message, "Hello world!", "Message to print");
+
+static bool IsNonEmptyMessage(const char *flagname, const std::string &value) {
+  return value[0] != '\0';
+}
+DEFINE_validator(message, &IsNonEmptyMessage);
+
+namespace third_party {
+namespace gflags_test {
+
+TEST(GflagsTest, ParseAndPrint) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::SetVersionString("1.0.0");
+  int argc = 1;
+  char program_name[] = "gflags_test";
+  char **argv = new char *[2];
+  argv[0] = program_name;
+  argv[1] = NULL;
+  gflags::ParseCommandLineFlags(&argc, reinterpret_cast<char ***>(&argv), true);
+  EXPECT_EQ("gflags_test", std::string(gflags::ProgramInvocationShortName()));
+  EXPECT_EQ("Hello world!", FLAGS_message);
+  gflags::ShutDownCommandLineFlags();
+}
+
+}  // namespace gflags_test
+}  // namespace third_party
diff --git a/third_party/glog.BUILD b/third_party/glog.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff1d6b416c2217b62f64bceee3c6a611c11dfe
--- /dev/null
+++ b/third_party/glog.BUILD
@@ -0,0 +1,128 @@
+licenses(["notice"])
+
+cc_library(
+    visibility=["//visibility:public"],
+    name="glog",
+    includes=[
+        ".",
+        "src",
+    ],
+    copts=[
+        "-D_START_GOOGLE_NAMESPACE_='namespace google {'",
+        "-D_END_GOOGLE_NAMESPACE_='}'",
+        "-DGOOGLE_NAMESPACE='google'",
+        "-DGOOGLE_GLOG_DLL_DECL=''",
+        "-DHAVE_DLADDR",
+        "-DHAVE_SNPRINTF",
+        "-DHAVE_DLFCN_H",
+        "-DHAVE_FCNTL",
+        "-DHAVE_GLOB_H",
+        "-DHAVE_INTTYPES_H",
+        "-DHAVE_LIBPTHREAD",
+        "-DHAVE_SYS_SYSCALL_H",
+        "-DHAVE_MEMORY_H",
+        "-DHAVE_NAMESPACES",
+        "-DHAVE_PREAD",
+        "-DHAVE_PTHREAD",
+        "-DHAVE_PWD_H",
+        "-DHAVE_PWRITE",
+        "-DHAVE_RWLOCK",
+        "-DHAVE_SIGACTION",
+        "-DHAVE_SIGALTSTACK",
+        "-DHAVE_STDINT_H",
+        "-DHAVE_STRING_H",
+        "-DHAVE_SYS_TIME_H",
+        "-DHAVE_SYS_TYPES_H",
+        "-DHAVE_SYS_UCONTEXT_H",
+        "-DHAVE_SYS_UTSNAME_H",
+        "-DHAVE_UNISTD_H",
+        "-DHAVE_USING_OPERATOR",
+        "-DHAVE_HAVE___ATTRIBUTE___",
+        "-DHAVE_HAVE___BUILTIN_EXPECT",
+        #"-DNO_FRAME_POINTER",
+        "-D_GNU_SOURCE",
+        #"-fno-sanitize=thread",
+        #"-fno-sanitize=address",
+        "-Iexternal/glog/src",
+    ],
+    srcs=[
+        "src/demangle.cc",
+        "src/logging.cc",
+        "src/raw_logging.cc",
+        "src/signalhandler.cc",
+        "src/symbolize.cc",
+        "src/utilities.cc",
+        "src/vlog_is_on.cc",
+        ":config_h",
+        ":logging_h",
+        ":raw_logging_h",
+        ":stl_logging_h",
+        ":vlog_is_on_h",
+    ],
+    hdrs=[
+        "src/demangle.h",
+        "src/mock-log.h",
+        "src/stacktrace.h",
+        "src/symbolize.h",
+        "src/utilities.h",
+        "src/base/commandlineflags.h",
+        "src/base/googleinit.h",
+        "src/base/mutex.h",
+        "src/glog/log_severity.h",
+    ])
+
+genrule(
+    name="config_h",
+    srcs=["src/config.h.cmake.in"],
+    outs=["config.h"],
+    cmd="awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $(<) > $(@)",
+)
+
+genrule(
+    name="logging_h",
+    srcs=["src/glog/logging.h.in"],
+    outs=["glog/logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="raw_logging_h",
+    srcs=["src/glog/raw_logging.h.in"],
+    outs=["glog/raw_logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="stl_logging_h",
+    srcs=["src/glog/stl_logging.h.in"],
+    outs=["glog/stl_logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="vlog_is_on_h",
+    srcs=["src/glog/vlog_is_on.h.in"],
+    outs=["glog/vlog_is_on.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="gen_sh",
+    outs=["gen.sh"],
+    cmd="""
+cat > $@ <<"EOF"
+#! /bin/sh
+sed -e 's/@ac_cv_have_unistd_h@/1/g' \
+    -e 's/@ac_cv_have_stdint_h@/1/g' \
+    -e 's/@ac_cv_have_systypes_h@/1/g' \
+    -e 's/@ac_cv_have_libgflags_h@/1/g' \
+    -e 's/@ac_cv_have_uint16_t@/1/g' \
+    -e 's/@ac_cv_have___builtin_expect@/1/g' \
+    -e 's/@ac_cv_have_.*@/0/g' \
+    -e 's/@ac_google_start_namespace@/namespace google {/g' \
+    -e 's/@ac_google_end_namespace@/}/g' \
+    -e 's/@ac_google_namespace@/google/g' \
+    -e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g' \
+    -e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g' \
+    -e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'
+EOF""")
diff --git a/third_party/glog_test/BUILD b/third_party/glog_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..56d08e95f8e8f063829ae68586fa9ef53306fef6
--- /dev/null
+++ b/third_party/glog_test/BUILD
@@ -0,0 +1,10 @@
+licenses(["notice"])  # Apache 2.0
+
+cc_test(
+    name="glog_test",
+    srcs=["glog_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        "@glog//:glog",
+    ], )
diff --git a/third_party/glog_test/glog_test.cc b/third_party/glog_test/glog_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1d737d625d25e8675f636075876903c42881a35
--- /dev/null
+++ b/third_party/glog_test/glog_test.cc
@@ -0,0 +1,7 @@
+#include <iostream>
+#include <string>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(GlogTest, Logging) { LOG(INFO) << "Hello world"; }
diff --git a/third_party/gtest.BUILD b/third_party/gtest.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9255b51d9aaa9c7ee5cbc1b2d537815c7ecbfcba
--- /dev/null
+++ b/third_party/gtest.BUILD
@@ -0,0 +1,8 @@
+cc_library(
+    name="gtest",
+    srcs=glob(
+        ["src/*.cc"], exclude=["src/gtest-all.cc"]),
+    hdrs=glob(["include/**/*.h", "src/*.h"]),
+    copts=["-Iexternal/gtest/include"],
+    linkopts=["-pthread"],
+    visibility=["//visibility:public"], )
diff --git a/third_party/protobuf_test/BUILD b/third_party/protobuf_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..67d4293c70eef081f6bb95de9774613a19ba91dd
--- /dev/null
+++ b/third_party/protobuf_test/BUILD
@@ -0,0 +1,24 @@
+licenses(["notice"])  # Apache 2.0
+
+load("@protobuf//:protobuf.bzl", "cc_proto_library")
+
+cc_proto_library(
+    name="example_proto",
+    srcs=["example.proto"],
+    protoc="@protobuf//:protoc",
+    default_runtime="@protobuf//:protobuf", )
+
+cc_library(
+    name="example_lib",
+    srcs=["example_lib.cc"],
+    hdrs=["example_lib.h"],
+    deps=[":example_proto"], )
+
+cc_test(
+    name="example_lib_test",
+    srcs=["example_lib_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        ":example_lib",
+    ], )
diff --git a/third_party/protobuf_test/README.md b/third_party/protobuf_test/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8bdeee6fee66ef79d0b813b4d8dfa4c180754c6
--- /dev/null
+++ b/third_party/protobuf_test/README.md
@@ -0,0 +1 @@
+This package tests that Bazel can build protobuf related rules.
diff --git a/third_party/protobuf_test/example.proto b/third_party/protobuf_test/example.proto
new file mode 100644
index 0000000000000000000000000000000000000000..6a7eada9c14a9df5d3ef8971b636c14a11da3d11
--- /dev/null
+++ b/third_party/protobuf_test/example.proto
@@ -0,0 +1,7 @@
+syntax = "proto3";
+
+package third_party.protobuf_test;
+
+message Greeting {
+  string name = 1;
+}
diff --git a/third_party/protobuf_test/example_lib.cc b/third_party/protobuf_test/example_lib.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ced377bc0a17dde31c5c853dec1a852fa0be7223
--- /dev/null
+++ b/third_party/protobuf_test/example_lib.cc
@@ -0,0 +1,9 @@
+#include "third_party/protobuf_test/example_lib.h"
+
+namespace third_party {
+namespace protobuf_test {
+
+std::string get_greet(const Greeting& who) { return "Hello " + who.name(); }
+
+}  // namespace protobuf_test
+}  // namespace thrid_party
diff --git a/third_party/protobuf_test/example_lib.h b/third_party/protobuf_test/example_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..516326e812e19eb162f5392b519904a65c66c660
--- /dev/null
+++ b/third_party/protobuf_test/example_lib.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "third_party/protobuf_test/example.pb.h"
+
+#include <string>
+
+namespace third_party {
+namespace protobuf_test {
+
+std::string get_greet(const Greeting &who);
+
+}  // namespace protobuf_test
+}  // namespace third_party
diff --git a/third_party/protobuf_test/example_lib_test.cc b/third_party/protobuf_test/example_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6229f56e6026908fff991765bd6bdaff6f8236ac
--- /dev/null
+++ b/third_party/protobuf_test/example_lib_test.cc
@@ -0,0 +1,15 @@
+#include "third_party/protobuf_test/example_lib.h"
+
+#include "gtest/gtest.h"
+
+namespace third_party {
+namespace protobuf_test {
+
+TEST(ProtobufTest, GetGreet) {
+  Greeting g;
+  g.set_name("Paddle");
+  EXPECT_EQ("Hello Paddle", get_greet(g));
+}
+
+}  // namespace protobuf_test
+}  // namespace third_party
diff --git a/warp-ctc b/warp-ctc
new file mode 160000
index 0000000000000000000000000000000000000000..bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2
--- /dev/null
+++ b/warp-ctc
@@ -0,0 +1 @@
+Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2