Merge remote-tracking branch 'upstream/develop' into fix-4388

e65960ed · wangmeng28 · 0d79e973 · 9276ca78 · e65960ed · e65960ed
849 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/v2/framework/tests/tmp/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,6 +31,3 @@
    -   id: go-fmt
        types:
        - go
-    -   id: gometalinter
-        types:
-        - go
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,6 +30,7 @@ addons:
      - automake
      - libtool
      - ccache
+  ssh_known_hosts: 52.76.173.135
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
@@ -42,6 +43,14 @@ script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+  - |
+    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
+    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
+    export DOCS_DIR=`pwd`
+    cd ..
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc   
 notifications:
  email:
    on_success: change

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,14 @@ if(ANDROID OR IOS)
        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLML OFF CACHE STRING
        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    # Compile PaddlePaddle mobile inference library
+    if (NOT WITH_C_API)
+        set(WITH_C_API ON CACHE STRING
+            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
+    endif()
+    set(MOBILE_INFERENCE ON)
+    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -97,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON)
    "different Python interpreter from compiling.")
 endif()
+if(MOBILE_INFERENCE)
+    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
+else()
+    set(THIRD_PARTY_BUILD_TYPE Release)
+endif()
 ########################################################################################
 include(external/mklml)     # download mklml package
@@ -113,6 +127,7 @@ include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)    # download pybind11
+include(external/nccl)
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -145,7 +160,7 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
    endif(NOT WITH_DSO)
 endif(WITH_GPU)
@@ -160,9 +175,11 @@ endif(USE_NNPACK)
 add_subdirectory(proto)
-# "add_subdirectory(go)" should be placed after the following loine,
+if(NOT MOBILE_INFERENCE)
-# because it depends on paddle/optimizer.
+    # "add_subdirectory(go)" should be placed after the following loine,
-add_subdirectory(paddle/optimizer)
+    # because it depends on paddle/optimizer.
+    add_subdirectory(paddle/optimizer)
+endif()
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
-./doc/howto/dev/contribute_to_paddle_en.md
+# Contribute Code
+We sincerely appreciate your contribution.  This document explains our workflow and work style.
+## Workflow
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.
+1. Fork
+   Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork,  just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+1. Clone
+   To make a copy of your fork to your local computers, please run
+   ```bash
+   git clone https://github.com/your-github-account/paddle
+   cd paddle
+   ```
+1. Create the local feature branch
+   For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:
+   ```bash
+   git checkout -b my-cool-stuff
+   ```
+1. Commit
+   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+   ```bash
+   pip install pre-commit
+   pre-commit install
+   ```
+   Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python.
+   Once installed, `pre-commit` checks the style of code and documentation in every commit.  We will see something like the following when you run `git commit`:
+   ```
+   ➜  git commit
+   CRLF end-lines remover...............................(no files to check)Skipped
+   yapf.................................................(no files to check)Skipped
+   Check for added large files..............................................Passed
+   Check for merge conflicts................................................Passed
+   Check for broken symlinks................................................Passed
+   Detect Private Key...................................(no files to check)Skipped
+   Fix End of Files.....................................(no files to check)Skipped
+   clang-formater.......................................(no files to check)Skipped
+   [my-cool-stuff c703c041] add test file
+    1 file changed, 0 insertions(+), 0 deletions(-)
+    create mode 100644 233
+   ```
+1. Build and test
+   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+1. Keep pulling
+   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts.
+   ```bash
+   git remote add upstream https://github.com/PaddlePaddle/Paddle
+   git pull upstream develop
+   ```
+1. Push and file a pull request
+   You can "push" your local work into your forked repo:
+   ```bash
+   git push origin my-cool-stuff
+   ```
+   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  Github would close the issue when the owners merge your pull request.
+   Please remember to specify some reviewers for your pull request.  If you don't know who are the right ones, please follow Github's recommendation.
+1. Delete local and remote branches
+   To keep your local workspace and your fork clean, you might want to remove merged branches:
+   ```bash
+   git push origin :my-cool-stuff
+   git checkout develop
+   git pull upstream develop
+   git branch -d my-cool-stuff
+   ```
+### Code Review
+-  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
+- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; please give a reason otherwise.
+- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+- Reduce the unnecessary commits.  Some developers commit often.  It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
+## Coding Standard
+### Code Style
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default.  This flag is on
+Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
+### Unit Tests
+Please remember to add related unit tests.
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
+### Writing Logs
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+`VLOG` requires a *verbose level* parameter.  For example:
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << "inputs."
+```
+When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \

--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
+# Benchmark
+Machine:
+- Server
+ 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop
+ 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
+ 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
+- Desktop
+ 	- i7-6700k
+System: CentOS release 6.3 (Final), Docker 1.12.1.
+PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
+- MKL-DNN tag v0.10
+- MKLML 2018.0.20170720
+- OpenBLAS v0.2.20
+On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
+## Benchmark Model
+### Server
+Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+Input image size - 3 * 224 * 224, Time: images/second
+- VGG-19
+| BatchSize    | 64    | 128  | 256     |
+|--------------|-------| -----| --------|
+| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
+| MKLML        | 11.02 | 12.86 | 15.33  |
+| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+chart on batch size 128
+TBD
+ - ResNet
+ - GoogLeNet
+### Laptop
+TBD
+### Desktop
+TBD
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,10 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
+if(WITH_TESTING)
+    add_definitions(-DPADDLE_WITH_TESTING)
+endif(WITH_TESTING)
 if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
@@ -49,19 +53,20 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
    FIND_PACKAGE(CUDA REQUIRED)
    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
    endif()
    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
+        message(FATAL_ERROR "Paddle needs cudnn to compile")
    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -8,7 +8,7 @@ ExternalProject_Add(
    extern_eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         "master"
+    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
    PREFIX          ${EIGEN_SOURCE_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -36,6 +36,7 @@ ExternalProject_Add(
    # change this back to the official Github repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
+    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -45,11 +46,11 @@ ExternalProject_Add(
                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -31,6 +31,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_TAG         v0.3.5
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -43,12 +44,12 @@ ExternalProject_Add(
                    -DWITH_GFLAGS=ON
                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -56,11 +56,11 @@ IF(WITH_TESTING)
                        -DBUILD_GMOCK=ON
                        -Dgtest_disable_pthreads=ON
                        -Dgtest_force_shared_crt=ON
-                        -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                        ${EXTERNAL_OPTIONAL_ARGS}
        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=Release
+                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    )
    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)

--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
+include(ExternalProject)
+set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+  # Note: cuda 8.0 is needed to make nccl
+  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
+  set(NCCL_BUILD_COMMAND "make -j 8")
+  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
+endif()
+ExternalProject_Add(
+    extern_nccl
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
+    GIT_TAG         "v1.3.4-1"
+    PREFIX          "${NCCL_SOURCE_DIR}"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
+    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
+    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
+    TEST_COMMAND      ""
+)
+if(WITH_DSO)
+  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  add_library(nccl STATIC IMPORTED GLOBAL)
+  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
+               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
+endif()
+add_dependencies(nccl extern_nccl)
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            ${OPTIONAL_ARGS}
            -Dprotobuf_BUILD_TESTS=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_INSTALL_LIBDIR=lib
        CMAKE_CACHE_ARGS
            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
            ${OPTIONAL_CACHE_ARGS}

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -35,6 +35,7 @@ ExternalProject_Add(
    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -48,9 +49,9 @@ ExternalProject_Add(
                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                    -DBUILD_SHARED=ON
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -42,11 +42,11 @@ ExternalProject_Add(
                    -DBUILD_SHARED_LIBS=OFF
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_MACOSX_RPATH=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 LIST(APPEND external_project_dependencies zlib)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -389,13 +389,60 @@ function(go_test TARGET_NAME)
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction(go_test)
+# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support
+# Usage:
+#   paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
+function(paddle_protobuf_generate_cpp SRCS HDRS)
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    return()
+  endif()
+  set(${SRCS})
+  set(${HDRS})
+  if (MOBILE_INFERENCE)
+      set(EXTRA_FLAG "lite:")  
+  else()
+      set(EXTRA_FLAG "") 
+  endif()
+  foreach(FIL ${ARGN})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
+    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
+    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
+    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
+    add_custom_command(
+      OUTPUT "${_protobuf_protoc_src}"
+             "${_protobuf_protoc_hdr}"
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+      -I${CMAKE_CURRENT_SOURCE_DIR}
+      --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
+      DEPENDS ${ABS_FIL} protoc
+      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
+      VERBATIM )
+  endforeach()
+  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
+endfunction()
 function(proto_library TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(proto_srcs)
  set(proto_hdrs)
-  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -73,25 +73,43 @@ function(link_paddle_exe TARGET_NAME)
        generate_rdma_links()
    endif()
-    target_circle_link_libraries(${TARGET_NAME}
+    if(MOBILE_INFERENCE)
-        ARCHIVE_START
+        target_circle_link_libraries(${TARGET_NAME}
-        paddle_gserver
+            ARCHIVE_START
-        paddle_function
+            paddle_gserver
-        ARCHIVE_END
+            paddle_function
-        paddle_pserver
+            ARCHIVE_END
-        paddle_trainer_lib
+            paddle_math
-        paddle_network
+            paddle_utils
-        paddle_math
+            paddle_parameter
-        paddle_utils
+            paddle_proto
-        paddle_parameter
+            paddle_cuda
-        paddle_proto
+            ${EXTERNAL_LIBS}
-        paddle_cuda
+            ${CMAKE_THREAD_LIBS_INIT}
-        paddle_optimizer
+            ${CMAKE_DL_LIBS}
-        ${EXTERNAL_LIBS}
+            ${RDMA_LD_FLAGS}
-        ${CMAKE_THREAD_LIBS_INIT}
+            ${RDMA_LIBS})
-        ${CMAKE_DL_LIBS}
+    else()
-        ${RDMA_LD_FLAGS}
+        target_circle_link_libraries(${TARGET_NAME}
-        ${RDMA_LIBS})
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_pserver
+            paddle_trainer_lib
+            paddle_network
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            paddle_optimizer
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    endif()
    if(ANDROID)
        target_link_libraries(${TARGET_NAME} log)

--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
    trainer_config_helpers/optimizers.rst
    trainer_config_helpers/data_sources.rst
    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
    trainer_config_helpers/poolings.rst
    trainer_config_helpers/networks.rst
    trainer_config_helpers/evaluators.rst

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
    :noindex:
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept

--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,3 +125,8 @@ simple_attention
    :members: simple_attention
    :noindex:
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -5,12 +5,12 @@
 Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
 - Caffe, Torch, and Paddle: sequences of layers.
- TensorFlow, Caffe2, Mxnet: graphs of operators.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
 - PaddlePaddle: nested blocks, like C++ and Java programs.
 ## Block in Programming Languages and Deep Learning
-In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators.
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
@@ -24,14 +24,14 @@ A key difference is that a C++ program describes a one pass computation, whereas
 ## Stack Frames and the Scope Hierarchy
-The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other:
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-| programming languages | PaddlePaddle                  |
+| programming languages | PaddlePaddle                    |
-|-----------------------|-------------------------------|
+|-----------------------|---------------------------------|
-| stack                 | scope hierarchy               |
+| stack                 | scope hierarchy                 |
-| stack frame           | scope                         |
+| stack frame           | scope                           |
-| push at entering block| push at entering block        |
+| push at entering block| push at entering block          |
-| pop at leaving block  | destroy at minibatch completes|
+| pop at leaving block  | destroy when minibatch completes|
 1. In traditional programs:
@@ -42,9 +42,9 @@ The existence of the backward makes the execution of a block of traditional prog
 1. In PaddlePaddle
   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
-   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
   - The height of the highest tree is the maximum depth of nested blocks.
-   - After the process of a minibatch, PaddlePaddle destroys the scope hierarchy.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
 ## Use Blocks in C++ and PaddlePaddle Programs
@@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ programs shows how blocks are used with the `if-else` structure:
 ```c++
+namespace pd = paddle;
 int x = 10;
-int y = 20;
+int y = 1;
-int out;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
  int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
+  int d = pd::layer::fc(z);
-  out = z;
+  o1 = d;
+  o2 = d+1;
 }
 ```
 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd
-x = var(10)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(20)
+y = var(1) # shape=[1], value=1
-cond = var(false)
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
+    d = pd.layer.add_scalar(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
-    ie.set_output(true, 0, operator.softmax(z))
 with ie.false_block():
-    x = ie.inputs(false, 0)
+    d = pd.layer.fc(z)
-    z = layer.fc(x)
+    ie.output(d, d+1)
-    ie.set_output(true, 0, operator.softmax(z))
+o1, o2 = ie(cond)
-out = b(cond)
 ```
-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` .
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
-A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
 ### Blocks with `for` and `RNNOp`
-The following RNN model from the [RNN design doc](./rnn.md)
+The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
 ```python
-x = sequence([10, 20, 30])
+x = sequence([10, 20, 30]) # shape=[None, 1]
-m = var(0)
+m = var(0) # shape=[1]
-W = tensor()
+W = var(0.314, param=true) # shape=[1]
-U = tensor()
+U = var(0.375, param=true) # shape=[1]
-rnn = create_rnn(inputs=[input])
+rnn = pd.rnn()
-with rnn.stepnet() as net:
+with rnn.step():
-  x = net.set_inputs(0)
+  h = rnn.memory(init = m)
-  h = net.add_memory(init=m)
+  h_prev = rnn.previous_memory(h)
-  fc_out = pd.matmul(W, x)
+  a = layer.fc(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
+  b = layer.fc(U, h_prev)  
-  sum = pd.add_two(fc_out, hidden_out)
+  s = pd.add(a, b)
-  act = pd.sigmoid(sum)
+  act = pd.sigmoid(s)
-  h.update(act)                       # update memory with act
+  rnn.update_memory(h, act)
-  net.set_outputs(0, act, hidden_out) # two outputs
+  rnn.output(a, b)
 o1, o2 = rnn()
-print o1, o2
 ```
 has its equivalent C++ program as follows
 ```c++
 int* x = {10, 20, 30};
-int m = 0;
+int* m = {0};
-int W = some_value();
+int* W = {0.314};
-int U = some_other_value();
+int* U = {0.375};
 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@@ -131,25 +135,21 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
  int x = x[i-1];
  if (i == 1) mem[0] = m;
-  int fc_out = W * x;
+  int a = W * x;
-  int hidden_out = Y * mem[i-1];
+  int b = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int s = fc_out + hidden_out;
  int act = sigmoid(sum);
  mem[i] = act;
  o1[i] = act;
  o2[i] = hidden_out;
 }
-print_array(o1);
-print_array(o2);
 ```
 ## Compilation and Execution
-Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
-The generation of this protobuf message is like what a compiler generates a binary executable file.  The execution of the message that the OS executes the binary file.
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
 ## The "Binary Executable File Format"
@@ -186,10 +186,10 @@ Also, the RNN operator in above example is serialized into a protobuf message of
 ```
 OpDesc {
-  inputs = {0} // the index of x
+  inputs = {0} // the index of x in vars of BlockDesc above
-  outputs = {5, 3} // indices of act and hidden_out
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
  attrs {
-    "memories" : {1} // the index of h
+    "states" : {1} // the index of h
    "step_net" : <above step net>
  }
 };
@@ -203,32 +203,32 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
 During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
 VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example
+Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example:
 ```python
-a = pd.Varaible(shape=[20, 20])
+a = pd.Variable(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
+with rnn.stepnet():
-    x = net.set_inputs(a)
+    x = a.as_step_input()
    # reuse fc's parameter
    fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)
 out = rnn()
 ```
-the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
 In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
 To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
-`SymbolTable` can do the following stuff:
+`SymbolTable` can do the following:
 - store the definitions (some names and attributes) of variables and operators,
- to verify if a variable was declared,
+- verify if a variable was declared,
- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
 ```c++
@@ -240,19 +240,18 @@ class SymbolTable {
  OpDesc* NewOp(const string& name="");
-  // TODO determine whether name is generated by python or C++
+  // TODO determine whether name is generated by python or C++.
-  // currently assume that a unique name will be generated by C++ if the
+  // Currently assume that a unique name will be generated by C++ if the
-  // argument name left default.
+  // argument name is left default.
-  VarDesc* NewVar(const string& name="");
+  VarDesc* Var(const string& name="");
-  // find a VarDesc by name, if recursive true, find parent's SymbolTable
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
  // recursively.
  // this interface is introduced to support InferShape, find protobuf messages
  // of variables and operators, pass pointers into InferShape.
-  // operator
  //
  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
-  // be proposed and embedded into pybind to enable python operate on C++ pointers.
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
  VarDesc* FindVar(const string& name, bool recursive=true);
  OpDesc* FindOp(const string& name);
@@ -270,7 +269,7 @@ class SymbolTable {
 After all the description of variables and operators is added into SymbolTable,
 the block has enough information to run.
-The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions.
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
 ```c++
@@ -302,7 +301,7 @@ public:
  void CreateVariables(const framework::Scope& scope);
  void CreateOperators();
-  // some other necessary interfaces of NetOp are list below
+  // some other necessary interfaces of NetOp are listed below
  // ...
 private:
@@ -316,15 +315,14 @@ private:
 Block inherits from OperatorBase, which has a Run method.
 Block's Run method will run its operators sequentially.
-There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block,
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
-after `Run`, `Eval` will get the latest value and return the targets.
 The definition of Eval is as follows:
 ```c++
 // clean a block description by targets using the corresponding dependency graph.
 // return a new BlockDesc with minimal number of operators.
-// NOTE not return a Block but the block's description so that this can be distributed
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
 // to a cluster.
 BlockDesc Prune(const BlockDesc& desc, vector<string> targets);

--- a/doc/design/cluster_train/src/trainer.graffle
+++ b/doc/design/cluster_train/src/trainer.graffle
--- a/doc/design/dcgan.png
+++ b/doc/design/dcgan.png
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
+# Executor Design Doc
+## Motivation
+We use executor to do the runtime evaluation of a `ProgramDesc`.
+## Overview
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+### What does executor do?
+It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+### What does executor NOT do?
+It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
+## Implementation
+`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
--- a/doc/design/gan_api.md
+++ b/doc/design/gan_api.md
+# Design for GAN
+GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. 
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+<p align="center">
+<img src="./test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+<p align="center">
+<img src="./dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+## The Conditional-GAN might be a class. 
+This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:
+- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one. 
+Returns a 0/1 binary label.
+- build_model(self):
+build the whole GAN model, define training loss for both generator and discrimator.
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly)
+- Different optimizers responsible for optimizing different loss.
+To be more detailed, we introduce our design of DCGAN as following:
+### Class member Function: Initializer
+- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+    # hyper parameters  
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+    if not self.y_dim:
+      z = pd.layer.concat(1, [z, y])
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
+    G_im = pd.layer.tanh(G_im)
+    return G_im
+```
+### Class member function: Discriminator
+- Given a noisy input z, returns a fake image.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(h0)
+    D_h0_relu = pd.layer.lrelu(h0_bn)
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively. 
+If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.images)
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+```
+If we do not have dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusion and problems with this design:
+- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph.
+- Requires ability to create a block anytime, rather than in if-else or rnn only;
+## Main function for the demo:
+Generally, the user of GAN just need to the following things:
+- Define an object as DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+from paddle.v2 as pd
+import numpy as np
+import logging
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+    # load mnist data
+    data_X, data_y = self.load_mnist()
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block.g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
+    # executor
+    sess = pd.executor()
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+        if batch_id % 2 == 0:
+          sess.run(d_step, 
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+# More thinking about dependency engine v.s. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
--- a/doc/design/graph_survey.md
+++ b/doc/design/graph_survey.md
+## Survey on Graph
+Neural network framework often provides symbolic API for users to write network topology conveniently. This doc manily focus on symbolic API in most popular neural network frameworks, and try to find out how to parse symbolic configuration to a portable file, such as protobuf or json.
+### Mxnet
+The core concept of symbolic API is `Symbol`. Mxnet implements `Symbol` class in C++, and export to Python using C-API. Please refer to the comments in Mxnet:
+`Symbol` is help class used to represent the operator node in Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value.
+A simple network topology wrote by Symbol is as follows:
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null.
+Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph.
+And Symbol can be saved to a Json file.
+Here is a detailed example:
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+>>> fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+```
+### TensorFlow
+The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+A simple example is as follows:
+```python
+  # Build a dataflow graph.
+  c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+  d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+  e = tf.matmul(c, d)
+  # Construct a `Session` to execute the graph.
+  sess = tf.Session()
+  # Execute the graph and store the value that `e` represents in `result`.
+  result = sess.run(e)
+```
+The main method of `Tensor` is as follows: 
+```python
+@property
+def op(self):
+  """The `Operation` that produces this tensor as an output."""
+  return self._op
+@property
+def dtype(self):
+   """The `DType` of elements in this tensor."""
+  return self._dtype
+@property
+def graph(self):
+  """The `Graph` that contains this tensor."""
+  return self._op.graph
+@property
+def name(self):
+  """The string name of this tensor."""
+  if not self._op.name:
+    raise ValueError("Operation was not named: %s" % self._op)
+  return "%s:%d" % (self._op.name, self._value_index)
+@property
+def device(self):
+  """The name of the device on which this tensor will be produced, or None."""
+  return self._op.device
+```
+Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency.
+Here is a detailed example:
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+```
+### Dynet
+The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++.
+A simple example is as follows:
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. 
+Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency.
+Here is a detailed example:
+write topology in C++
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+Expression pred = W * xs[i];
+cg.print_graphviz();
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+compile and print
+```
+# first print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+}
+# third print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+  N2 [label="v2 = -1.88387 - v1"];
+  N1 -> N2;
+  N3 [label="v3 = -v2"];
+  N2 -> N3;
+  N4 [label="v4 = square(v3)"];
+  N3 -> N4;
+}
+```
+### Conclusion
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features:
+- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter.
+- Expression corresponds with a global Graph, and Expression can also be composed.
+- Expression tracks all dependency and can be taken as a run target
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator
-```python
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
-import paddle as pd
-x = var()
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
-y = var()
+- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch.
-cond = var()
-default_value = var()
+## Example
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
+The following PaddlePaddle program shows the usage of the IfElse operator:
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
-```
-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
-x = var()
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var()
+y = var(1) # shape=[1], value=1
-cond = var()
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-default_value = var()
+cond = larger_than(x, 15) # [false, true, true]
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
+ie = pd.ifelse()
-with b.true_block():
+with ie.true_block():
-    x = b.inputs(0)
+    d = pd.layer.add(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
-    b.set_output(0, operator.softmax(z))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+An equivalent C++ program is as follows:
+```c++
+namespace pd = paddle;
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
--- a/doc/design/images/asgd.gif
+++ b/doc/design/images/asgd.gif
--- a/doc/design/images/feed_forward.png
+++ b/doc/design/images/feed_forward.png
--- a/doc/design/images/feed_forward_regularized.png
+++ b/doc/design/images/feed_forward_regularized.png
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@@ -33,7 +33,6 @@ digraph ImageClassificationGraph {
        cost -> MSE_Grad [color=red];
        d_cost -> MSE_Grad [color=red];
-        x -> MSE_Grad [color=red];
        l -> MSE_Grad [color=red];
        y -> MSE_Grad -> d_y [color=red];

--- a/doc/design/images/graph_construction_example_all.png
+++ b/doc/design/images/graph_construction_example_all.png
--- a/doc/design/images/graph_construction_example_forward_backward.png
+++ b/doc/design/images/graph_construction_example_forward_backward.png
--- a/doc/design/images/graph_construction_example_forward_only.png
+++ b/doc/design/images/graph_construction_example_forward_only.png
--- a/doc/design/images/l1_regularization.png
+++ b/doc/design/images/l1_regularization.png
--- a/doc/design/images/l2_regularization.png
+++ b/doc/design/images/l2_regularization.png
--- a/doc/design/images/loss_equation.png
+++ b/doc/design/images/loss_equation.png
--- a/doc/design/images/theta_star.gif
+++ b/doc/design/images/theta_star.gif
--- a/doc/design/infer_var_type.md
+++ b/doc/design/infer_var_type.md
+# Design Doc: InferVarType
+## The Problem Posed
+The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output.
+For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time.
+## Proposed Solution
+The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is:
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+It takes an operator description as its input and will write the output variable type and store them in block description.
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`.
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+## Register InferVarType
+We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not.
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+}
+```
+Operator developers can write the specialize `VarTypeInferer` as follow.
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+}
+```
+Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
--- a/doc/design/model_format.md
+++ b/doc/design/model_format.md
+# Design Doc: Model Format
+## Motivation
+A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
+As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. 
+## Implementation
+The topology is saved as a plain text in a detailed self-contain protobuf file. 
+The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
+The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
+|field name  | type | description |
+| --- | --- | --- |
+| version | uint32_t | Version of saved file. Always 0 now. |
+| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
+| tensor desc | void* | TensorDesc protobuf binary message |
+| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+| lod_level | uint64_t | Level of LoD |
+| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
+| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
+| ... | ... | ... |
+## Summary
+- We introduce a model format.
+- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message.
+- A bunch of specified format binary tensors describe the **parameters**.
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
+## Optimizer Design
+### The Problem
+A PaddlePaddle program, or a block, is a sequence of operators operating variables.  A training program needs to do three kinds of works:
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which update model parameters to optimize the cost(s).
+These works rely on three kinds of operators:
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
+### High-level Python API to describe the training process
+1. User write code to describe the network:
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+2. Users create a certain kind of Optimizer with some argument.
+	```python
+	optimizer = AdagradOptimizer(learing_rate=0.001)
+	```
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session.
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+#### Optimizer Python interface:
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+    """
+    def __init__(self):
+        pass
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+        Returns:
+          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+        """
+        return None
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+        This method combines interface `append_backward_ops()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+```
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
--- a/doc/design/parameter_average.md
+++ b/doc/design/parameter_average.md
+# Averaging Parameter in PaddlePaddle
+## Why Averaging
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
+<img src="./images/asgd.gif" align="center"/><br/>
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+### How to perform Parameter Averaging in PaddlePaddle
+Parameter Averaging in PaddlePaddle works in the following way during training :
+1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all the N instances of the parameters in memory is not feasible.
+    3. Therefore, an approximation algorithm is used.
+Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+During the testing/ saving the model phase, we perform the following steps:
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
+### How to implement Averaging of Parameter in PaddlePaddle
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+	**Advantages**:
+    - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+    - Makes it easy for the users to customize and extend the framework.
+	**Disadvantages**:
+    - Implementation requires re-writing the averaging methodology in Python.  
+### Low-Level implementation
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+### Python API implementation for ParameterAverageOptimizer
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc)
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+The proposal is to add the op immediately while building the computation graph.
+#### High-level API
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
--- a/doc/design/program.md
+++ b/doc/design/program.md
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
 ```python
 x = layer.data("images")
@@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```
-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message.  The last line runs it.
-```protobuf
+## Programs and Blocks
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
 }
+```
+The following PaddlePaddle program has three blocks:
+```python
+import paddle as pd  // block 0
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
+with ie.true_block():  // block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  // block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+## `BlockDesc` and `ProgramDesc`
+All protobuf messages are defined in `framework.proto`.
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+```protobuf
 message BlockDesc {
  required int32 parent = 1;
  repeated VarDesc vars = 2;
  repeated OpDesc ops = 3;
 }
+```
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+### Global Block
+The global block is the first one in the above array.
+## Operators that Use Blocks
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+The definition of `OpDesc` shows that an operator could have some attributes:
+```protobuf
 message OpDesc {
  AttrDesc attrs = 1;
  ...
 }
+```
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;
-  // index into ProgramDesc::blocks when type==BLOCK
+  enum AttrType {
-  optional int32 block = 2;
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+  optional int32 block = 10; // when type == BLOCK
  ...
 }
 ```
-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
+## InferShape
-A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
 With this design, the InferShape function should take the following parameters:

--- a/doc/design/prune.md
+++ b/doc/design/prune.md
+# Prune
+## Motivation
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement 
+`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
+and generate a pruned `ProgramDesc`.
+## Challenge
+Pruning need to support both variables and operators being evaluation targets. Consider the following
+different situations.
+```python
+# Case 1: run foward pass.
+cost_np = session.run(target=cost)
+# Case 2: run backward passing.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+## Solution
+To support evaluation of operators, we add `is_target` field in the `OpDesc`.
+```c++
+message OpDesc {
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
+```
+To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being
+`fetch_op`'s input. Then we also set `fetch_op` is a target.
+### Algorithm
+If an operator needs to be run, it must fall into one of the following cases:
+1. It is the target.
+2. It is depended by some other ops, meaning its output is some other op's input.
+The first case can be checked by `op_desc.is_traget()` . The second case can be implement as
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+```
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
+# Design Doc: Python API
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
+## Core Concepts
+### Program
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+```python
+class Program(objects):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+    def global_block():
+        return self.blocks[0]
+    def current_block():
+        return self.get_block(self.current_block)
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+### Block
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+    def create_var(self, ...):
+        return Variable(self, ...)
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+### Operator
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<stirng, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+    def type(self):
+        return self.desc.type()
+```
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+### Variable
+Operators take Variables as its inputs and outputs.
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+Please be aware of `self.writer`, that tracks operator who creates the variable.  It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+### Parameter
+A parameter is a global variable with an initializer (or load) operator.
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+When users create a parameter, they can call
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+In above example, `init_attr.type` names an initialize operator.  It can also name the load operator
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+## Layer Function
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+Layer functions take `Variable` and configuration parameters as its input and return the output variable(s).
+For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+### Necessity for reusing code between layer functions
+There are a lot of code that can be reused. Such as
+* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+### Comparision between global functions and helper class
+The `FullyConnected` layer will be as follow when we provide global functions:
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+  mul_results.append(tmp)
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions:
+1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
+2. Global functions will force layer developers to pass its parameter time by time.
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow.
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor.
+### Implementation of layer helper
+We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are:
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+    tmp = self.create_tmp_var(self)
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+## Optimizer
+[Optimizer Design Doc](./optimizer.md)
--- a/doc/design/refactor/session.md
+++ b/doc/design/refactor/session.md
+# Design Doc: Session
+## Abstract
+The *session* object encapsulates the environment in which the
+computation graph is executed.
+We will have the *local* session and *remote* session, they offer the
+same [interface](#interface). The local session encapsulates the local
+runtime environment and the remote session encapsulates the cluster
+runtime environment.
+The local runtime environment contains:
+1. computation devices (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+The remote runtime environment contains:
+1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
+   and
+1. the distributed [scope](../scope.md) in a cluster which holds all
+   variables.
+The user can create a remote session on Paddle Cloud and evaluate the
+computation graph with it. In this way, the user can control the
+remote computation resource in a cluster from his local computer.
+## Background
+The current design has an implicit global session in which
+`paddle.eval()` is executed. The pain point is:
+Since the user is not able to explicitly switch between runtime
+environments, the user cannot run a topology in two independent
+environments.
+For example, in reinforcement learning, the user may want to have a
+stale model for inference and a fresh model for training, and only
+replace the stale model with the fresh model periodically.
+Furthermore, we have no concept that encapsulates a remote environment
+that executes a computation graph.
+We need the session object to address above issues.
+## Session
+A session is an object that owns the runtime environment. All
+computations are executed through `session.eval()`.
+### Interface
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+Evaluates the target Operations or Variables in `targets`.
+- *targets*: the evaluation targets. Can be a single Operation or
+  Variable, or a list with the Operations or Variables as
+  elements. The value returned by `eval()` has the same shape as the
+  `target` argument.
+  The PaddlePaddle program is represented by
+  the [ProgramDesc](../design/program.md), `eval()` will infer the
+  ProgramDesc from the given targets and run the PaddlePaddle
+  program. Please
+  see
+  [this graph](./distributed_architecture.md#local-training-architecture) for
+  the detailed illustration for the local session
+  and
+  [this graph](./distributed_architecture.md#distributed-training-architecture) for
+  the detailed illustration for the remote session.
+- *feed_dict*: a dictionary that contains the tensors which override
+  the edges of the computation graph.
+  feed_dict not only can provide the input data, it can override any
+  OP's input as well:
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+```python
+close()
+```
+Closes the session and releases the scope that the session owns.
+### Create a Local Session
+```python
+session(
+    devices=None
+)
+```
+Creates a new session. One session owns one global scope, so creating
+multiple sessions will create different scopes.
+- *devices*: a single `string` or a list of `string` of device names,
+  the corresponding devices will be the computation devices for
+  `eval()`. If not specified, all available devices (e.g., all GPUs)
+  will be used. The user doesn't need to specify the CPU device since
+  it will be always used. Multiple sessions can use the same device.
+#### Example
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+### Create a Remote Session
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+Creates a Paddle Cloud job. Fails if the job name exists.
+```python
+get_cloud_job(
+    name
+)
+```
+Gets a Paddle Cloud job.
+```python
+remote_session(
+    job
+)
+```
+- *job*: the Paddle Cloud job.
+#### Example
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_ession(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
 # Design Doc: Refactorization Overview
-The goal of refactorizaiton include:
+The goals of refactoring include:
-1. Make it easy for external contributors to write new elementory computaiton operations.
+1. Making it easy for external contributors to write new elementary computation operations.
-1. Make the codebase clean and readable.
+1. Making the codebase clean and readable.
-1. Introduce a new design of computation representation -- a computation graph of operators and variables.
+1. Designing a new computation representation -- a computation graph of operators and variables.
-1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
 ## Computation Graphs
-1. PaddlePaddle represent the computation, training and inference of DL models, by computation graphs.
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
-  1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
-1. Users write Python programs to describe the graphs and run it (locally or remotely).
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
 1. A graph is composed of *variables* and *operators*.
-1. The description of graphs must be able to be serialized/deserialized, so it
+1. The description of graphs must be serializable/deserializable, so that:
-   1. could to be sent to the cloud for distributed execution, and
+   1. It can be sent to the cloud for distributed execution, and
-   1. be sent to clients for mobile or enterprise deployment.
+   1. It can be sent to clients for mobile or enterprise deployment.
-1. The Python program do
+1. The Python program does two things
-   1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
      1. the C++ library `libpaddle.so` for local execution,
      1. the master process of a distributed training job for training, or
      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and run them.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
-## Description and Realization
+## Description and Realization of Computation Graph
-At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
-At runtime, the C++ program realizes the graph and run it.
+At runtime, the C++ program realizes the graph and runs it.
 | | Representation (protobuf messages) | Realization (C++ class objects) |
 |---|---|---|
@@ -42,30 +42,31 @@ At runtime, the C++ program realizes the graph and run it.
 |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
 |Block|BlockDesc|Block|
-The word *graph* is exchangable with *block* in this document.  A graph represent computation steps and local variables as a C++/Java program block, or a pair of { and }.
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
 ## Compilation and Execution
-1. Run an applicaton Python program to describe the graph.  In particular,
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
-   1. create VarDesc to represent local/intermediate variables,
+   1. Create `VarDesc` to represent local/intermediate variables,
-   1. create operators and set attributes,
+   1. Create operators and set attributes,
-   1. validate attribute values,
+   1. Validate attribute values,
-   1. inference the type and the shape of variables,
+   1. Infer the type and the shape of variables,
-   1. plan for memory-reuse for variables,
+   1. Plan memory-reuse for variables,
-   1. generate backward and optimization part of the Graph.
+   1. Generate the backward graph
-   1. possiblly split the graph for distributed training.
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
-1. The invocation of `train` or `infer` in the application Python program:
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
-   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
      1. realize local variables defined in the BlockDesc message in the new scope,
      1. a scope is similar to the stack frame in programming languages,
-   1. create an instance of class `Block`, in which,
+   1. Create an instance of class `Block`, in which,
      1. realize operators in the BlockDesc message,
-   1. run the Block by calling
+   1. Run the Block by calling
      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
      1. `Block::Eval(vector<Operator>* targets)` for optimization.
@@ -76,14 +77,14 @@ The word *graph* is exchangable with *block* in this document.  A graph represen
 Compile Time -> IR -> Runtime
 ```
-### Benefit
+### Benefits of IR
 - Optimization
  ```text
  Compile Time -> IR -> Optimized IR -> Runtime
  ```
- Send automatically partitioned IR to different nodes.
+- Automatically send partitioned IR to different nodes.
-  - Automatic data parallel
+  - Automatic Data Parallelism
    ```text
    Compile Time
    |-> Single GPU IR
@@ -92,7 +93,7 @@ Compile Time -> IR -> Runtime
            |-> Node-1 (runs trainer-IR-1)
            |-> Node-2 (runs pserver-IR)
    ```
-  - Automatic model parallel (planned for future)
+  - Automatic Model Parallelism (planned for future)
 ---
@@ -105,10 +106,10 @@ Compile Time -> IR -> Runtime
 # Operator
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
-* `Operator` is the fundamental building block as the user interface.
+* `Operator` is the fundamental building block of the user interface.
-    * Operator stores input/output variable name, and attributes.
+    * Operator stores input/output variable names and attributes.
-    * The `InferShape` interface is used to infer output variable shapes by its input shapes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
-    * Use `Run` to compute `input variables` to `output variables`.
+    * Use `Run` to compute the `output` variables from the `input` variables.
 ---
@@ -126,30 +127,29 @@ Compile Time -> IR -> Runtime
 # Why separate Kernel and Operator
 * Separate GPU and CPU code.
-    * Make Paddle can run without GPU.
+    * Make Paddle capable of running without GPU.
-* Make one operator (which is user interface) can contain many implementations.
+* Make one operator (which is a user interface) and create many implementations.
-    * Same mul op, different FP16, FP32 Kernel. different MKL, eigen kernel.
+    * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
 ---
 # Libraries for Kernel development
 * `Eigen::Tensor` contains basic math and element-wise functions.
    * Note that `Eigen::Tensor` has broadcast implementation.
-    * Limit number of `tensor.device(dev) = ` in your code.
+    * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
+* `thrust::transform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
-    * `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
-    * Do not write `.h`. CPU Kernel should be in `.cc`. GPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Register
+# Operator Registration
-## Why register is necessary?
+## Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
-## How to do the register?
+## How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
-Maintain a map, whose key is the type name and value is corresponding Op constructor.
 ---
 # The Registry Map
@@ -169,7 +169,7 @@ Maintain a map, whose key is the type name and value is corresponding Op constru
 # Related Concepts
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
 ### Register Macros
 ```cpp
@@ -177,34 +177,30 @@ REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
 REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
-### `USE` Macros
-make sure the registration process is executed and linked.
 ---
-# Register Process
+# Registration Process
-1. Write Op class, as well as its gradient Op class if there is.
+1. Write an Op class and its gradient Op class, if required.
-2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
-3. Invoke macro `REGISTER_OP`. The macro will
+3. Invoke the macro `REGISTER_OP`. This macro will
-	1. call maker class to complete `proto` and `checker`
+	1. Call maker class to complete `proto` and `checker`
-	2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-4. Invoke `USE` macro in where the Op is used to make sure it is linked.
 ---
 # Backward Module (1/2)
 ### Create Backward Operator
- Mapping from forwarding Op to backward Op
+- Mapping from forward Op to backward Op
 ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
 ---
 # Backward Module (2/2)
 ### Build Backward Network
- **Input** graph of forwarding operators
+- **Input**: a graph of forward operators
- **Output** graph of backward operators
+- **Output**: a graph of backward operators
- **corner case in construction**
+- **Corner cases in construction**
-	- shared variable => insert `Add` operator
+	- Shared Variables => insert an `Add` operator to combine gradients
-	- no gradient => insert `fill_zero_grad` operator
+	- No Gradient => insert a `fill_zero_grad` operator
-	- recursive netOp => call `Backward` recursively
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
 	- RNN Op => recursively call `Backward` on stepnet
@@ -213,41 +209,41 @@ make sure the registration process is executed and linked.
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
-	* All operators on `Tensor` is written in `Operator` or global functions.
+	* All operations on `Tensor` are written in `Operator` or global functions.
-	* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
-	* step_scopes in RNN is a variable and not a tensor.
+	* `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables store at.
+* `Scope` is where variables are stored.
-	* map<string/*var name */, Variable>
+	* map<string `var name`, Variable>
-	* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 ---
 # Block (in design)
-## the difference with original RNNOp
+## the difference between original RNNOp and Block
- as an operator is more intuitive than `RNNOp`,
+- As an operator is more intuitive than `RNNOp`,
- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- fits the compile-time/ runtime separation design.
+- Fits the compile-time/ runtime separation design paradigm.
-  - during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
-  - when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 ---
 # Milestone
- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- model migration
+- Model migration
-  - framework development gives **priority support** to model migration, for example,
+  - Framework development gives **priority support** to model migration, for example,
    - the MNIST demo needs a Python interface,
    - the RNN models require the framework to support `LoDTensor`.
-  - determine some timelines,
+  - Determine some timelines,
-  - heavily-relied Ops need to be migrated first,
+  - Frequently used Ops need to be migrated first,
-  - different models can be migrated parallelly.
+  - Different models can be migrated in parallel.
- improve the framework at the same time
+- Improve the framework at the same time
- accept imperfection, concentrated on solving the specific problem at the right price.
+- Accept imperfection, concentrate on solving the specific problem at the right price.
 ---
 # Control the migration quality
- compare the performance of migrated models with old ones.
+- Compare the performance of migrated models with old ones.
- follow google C style
+- Follow the google C++ style guide.
- build the automatic workflow of generating Python/C++ documentations
+- Build the automatic workflow of generating Python/C++ documentations.
-  - the documentation of layers and ops should be written inside the code
+  - The documentation of layers and ops should be written inside the code.
-  - take the documentation quality into account when doing PR
+  - Take the documentation quality into account when submitting pull requests.
-  - preview the documentations, read and improve them from users' perspective
+  - Preview the documentations, read and improve them from a user's perspective.
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
+# Design Doc: Gradient Operators Registration
+## The Problem Posed
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+However, we noticed two problems with the current design:
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message.
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+## The Current Implementation
+Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+map<string, OpInfo> OpInfoMap;
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+## Proposed Solution
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for  the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators.
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`.
+The user interface should be
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
--- a/doc/design/regularization.md
+++ b/doc/design/regularization.md
+# Regularization in PaddlePaddle
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore.
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+<img src="./images/loss_equation.png" align="center"/><br/>
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+##### L2 Regularization:
+<img src="./images/l2_regularization.png" align="center"/><br/>
+##### L1 Regularization
+<img src="./images/l1_regularization.png" align="center"/><br/>
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+## Regularization Survey
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). 
+## Proposal for Regularization in PaddlePaddle
+### Low-Level implementation
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. 
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+### Computation Graph
+Below is an example of a really simple feed forward neural network.
+<img src="./images/feed_forward.png" align="center"/><br/>
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+### Python API implementation for Regularization
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph. 
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+The proposal is to add these ops in a lazy manner just before the backward pass. 
+#### Storage of Regularization attributes
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+#### High-level API
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
 public:
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
  const Variable* FindVar(const std::string& name) const;
 private:
@@ -98,7 +98,7 @@ class Scope {
  Variable* FindVar(const std::string& name) const;
  // return if already contains same name variable.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
 private:
  std::shared_ptr<Scope> parent_;
@@ -107,7 +107,7 @@ class Scope {
 ```
 ## Only scope can create a variable
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
 ## When scope destroyed, all variables inside this scope should be destroyed together
@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
 ## Orthogonal interface
-`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
--- a/doc/design/selected_rows.md
+++ b/doc/design/selected_rows.md
+# Design Doc: Selected Rows
+`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure:
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be:
+```
+x = SelectedRow {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+## SelectedRows in Protobuf
+`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. 
+So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description.
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+## InferShape for Selected Rows
+Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor.
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+## Sparse Operators
+There are several operators that need to be written to support `SelectedRows`. These are:
+1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
+2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`.
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
 # Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+ private:
+  vector<LoDTensor> values_;
+};
+```
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example for a RNN based on dynamic operators is 
+```python
+input = pd.data(...)
+num_steps = Var(12)
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+step = Var(1)
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps)
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(state+1)
+output = step_outputs.stack()
+```
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step.
+An RNN can be implemented with the following pseudocode
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.**
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+## Why `TensorArray`
+The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module.
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. 
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
+This is where the notion of `TensorArray` comes from.
+## Introduce TensorArray to uniform all the three RNNs
 TensorArray as a new concept is borrowed from TensorFlow, 
 it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
 This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
-such as `RecurrentGradientMachine`.
+such as `recurrent_op`, `RecurrentGradientMachine`.
 In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
 `TensorArray` is used to segment inputs and store states in all time steps.
 By providing some methods similar to a C++ array,
-the definition of some state-based dynamic models such as RNN could be more natural and highly flexible.
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
-## Dynamic-Related Methods
+## Dynamic-operations on TensorArray
-Some basic methods should be proposed as follows:
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
-### stack()
-Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`.
+```python
-### unstack(axis=0)
+# several helper operators for TensorArray
-Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+def tensor_array_stack(ta, tensor):
-### concat()
+    '''
-Return the values in the `TensorArray` as a concatenated Tensor.
+    get a tensor array `ta`, return a packed `tensor`.
-### write(index, value, data_shared=true)
+    '''
-Write value into index of the TensorArray.
+    pass
-### read(index)
-Read the value at location `index` in the `TensorArray`.
+def tensor_array_unstack(tensor, ta):
-### size()
+    '''
-Return the number of values.
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, 
+for example
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to split tensor into time steps for RNN or whileloop.
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to concatenate all the time steps for RNN or whileloop.
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(tensor, self.name)
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, than the index-th value in TensorArray will
+        be shared with the tensor passed in.
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+    def size(self, output):
+        '''
+        Return the number of values.
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
 ## LoDTensor-related Supports
-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input, 
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too.
-because each step of RNN could only take a tensor-represented batch of data as input, 
+Since each step of RNN can only take a tensor-represented batch of data as input, 
 some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches.
-Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`,
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor.
+Some definitions are like
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
-With these two methods, a variant-sentence-RNN can be implemented like
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+With these two methods, a varience-length sentence supported RNN can be implemented like
 ```c++
 // input is the varient-length data
@@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map);
 ```
 the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
 the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
-some details are as follows.
-### unpack(level, sort_by_length)
-Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length.
-Returns:
- a new `TensorArray`, whose values are LodTensors and represents batches of data.
- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor
-### pack(level, indices_map)
-Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`.
--- a/doc/design/test.dot
+++ b/doc/design/test.dot
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+    D_f -> g_loss;
+    label2 -> g_loss;
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+    d_loss [color=red];
+    g_loss [color=green];
+}
--- a/doc/design/test.dot.png
+++ b/doc/design/test.dot.png
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep
 ## Definition of VarDesc
-A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it.
+A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. 
 ```proto
 message VarDesc {
  required string name = 1;
-  optional LoDTensorDesc lod_tensor = 2;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LoDTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
 }
 ```
-## Definition of LodTensorDesc
+## Definition of TensorDesc
 ```proto
 enum DataType {
@@ -38,87 +45,25 @@ enum DataType {
  FP64 = 6;
 }
-message LoDTensorDesc {
+message TensorDesc {
  required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  optional int32 lod_level = 3 [default=0];
 }
 ```
-## Definition of Variable in Python
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md).
-In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable.
-```python
-image = Variable(dims=[-1, 640, 480])
-# fc1 and fc2 are both Variable
-fc1 = layer.fc(input=image, output_size=10)
-fc2 = layer.fc(input=fc1, output_size=20)
-```
-### what should class `Variable` Have
-1. `name`.a name of string type is used to mark the value of the Variable.
-1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator.
-1. `operator`. Variable should record which operator produce itself. The reaon is:
-  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
-In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
-```python
-import VarDesc
-import LoDTensorDesc
-import framework
-def AddInitialOperator(variable, initializer):
-	# add an initialize Operator to block to init this Variable
-class Variable(object):
-   def __init__(self, name, dims, type, initializer):
-      self._block = get_default_block()
-      self._name = name
-      self.op = None
-      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
-      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
-      self._var = framework.CreateVar(_var_desc)
-      self._block.add_var(self)
-      # add initial op according to initializer
+## Definition of LodTensorDesc
-      if initializer is not None:
-          AddInitialOperator(self, initializer)
-   def dims(self):
-      return self._var.dims()
-   def data_type(self):
-       return self._var.data_type()
-   def to_proto(self):
+```proto
-       pass
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
 ```
-Then we can use this Variable to create a fc layer in Python.
+A LoDTensorDesc contains a tensor and a lod_level.
-```python
+## Definition of Variable in Python
-import paddle as pd
-def flatten_size(X, num_flatten_dims):
-  prod = 1 # of last num_flatten_dims
-  for i in xrange(num_flatten_dims):
-    prod = prod * X.dims[-i-1]
-  return prod
-def layer.fc(X, output_size, num_flatten_dims):
-  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
-  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
-  out = Variable(type=FP32)
-  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
-  pd.InferShape(y)
-  return out
-x = Variable(dims=[-1, 640, 480])
-y = layer.fc(x, output_size=100)
-z = layer.fc(y, output_size=200)
-paddle.eval(targets=[z], ...)
+For Variable in Python, please reference [`Python API`](./python_api.md).
-print(z)
-```
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
 1. 两者都是对梯度的截断，但截断时机不同，前者在 :code:`optimzier` 更新网络参数时应用；后者在激活函数反向计算时被调用；
 2. 截断对象不同：前者截断可学习参数的梯度，后者截断回传给前层的梯度;
-除此之外，还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
+除此之外，还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。
 5.  如何调用 infer 接口输出多个layer的预测结果
 -----------------------------------------------

--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
 # 构建Android平台上的PaddlePaddle库
-用户可通过交叉编译的方式，在用户熟悉的开发平台（Linux，Mac OS X和Windows）上编译Android平台上适用的PaddlePaddle库。
+用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
+- 基于Docker容器的编译方式
+- 基于Linux交叉编译环境的编译方式
+## 基于Docker容器的编译方式
+Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行，因此，使用基于Docker容器的编译方式，用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
+### 构建PaddlePaddle的Android开发镜像
+我们把PaddlePaddle的交叉编译环境打包成一个镜像，称为开发镜像，里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t username/paddle-android:dev . -f Dockerfile.android
+```
+### 编译PaddlePaddle C-API库
+构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
+Android的Docker开发镜像向用户提供两个可配置的参数：
+| Argument        | Optional Values         | Default |
+|-----------------|-------------------------|---------|
+|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
+|`ANDROID_API`    |`>= 21` | `21` |
+- 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+```
+- 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+```
+执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
-## 准备交叉编译环境
+### 准备交叉编译环境
 从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn)，用户可自行前往下载预编译好的版本，也可通过以下命令获取：
@@ -13,18 +50,27 @@ unzip -q android-ndk-r14b-linux-x86_64.zip
 ```
 Android NDK中包含了所有Android API级别、所有架构（arm/arm64/x86/mips）需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别，构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。
-比如：
+- 构建`armeabi-v7a`、 `Android API 21`的独立工具链：
 ```bash
 your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
 ```
-此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。
+此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
-注意：**PaddlePaddle要求使用的编译工具链所支持的Andoid API级别不小于21**。
+- 构建`arm64-v8a`、 `Android API 21`的独立工具链：
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
-## 配置交叉编译参数
+此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链，面向架构为64位ARM64架构，支持的最小Android API级别为21，支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
+注意：**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
+### 配置交叉编译参数
 CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake)，以提供一些默认的编译器和编译参数相关配置。注意，从CMake 3.7版本开始，CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时，将会将用户传进来的配置参数传递CMake系统，交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
@@ -36,32 +82,57 @@ CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cm
 Android平台可选配置参数：
 - `ANDROID_STANDALONE_TOOLCHAIN`，独立工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别；否则，用户需要在cmake时手动设置这些值。无默认值。
- `ANDROID_ABI`，目标架构ABI。目前只支持`armeabi-v7a`，默认值为`armeabi-v7a`。
+- `ANDROID_TOOLCHAIN`，目标工具链。可设置`gcc/clang`，默认值为`clang`。
+	- CMake 3.7以上，将会始终使用`clang`工具链；CMake 3.7以下，可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。
+	- Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。
+- `ANDROID_ABI`，目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`，默认值为`armeabi-v7a`。
 - `ANDROID_NATIVE_API_LEVEL`，工具链的Android API级别。若没有显式设置，PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。
- `ANROID_ARM_MODE`，是否使用ARM模式。可设置`ON/OFF`，默认值为`ON`。
+- `ANROID_ARM_MODE`，是否使用ARM模式。
- `ANDROID_ARM_NEON`，是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
+- `ANDROID_ARM_NEON`，是否使用NEON指令。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
 其他配置参数：
+- `USE_EIGEN_FOR_BLAS`，是否使用Eigen库进行矩阵计算。可设置`ON/OFF`，默认值为`OFF`。
 - `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
-一种常用的cmake配置如下：
+常用的cmake配置如下：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
      -DANDROID_ABI=armeabi-v7a \
      -DANDROID_ARM_NEON=ON \
      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
 ```
+```
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \  
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
 用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
-## 编译和安装
+**性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 使用`clang`编译工具链
+- `armeabi-v7a`时，设置`USE_EIGEN_BLAS=ON`，使用Eigen进行矩阵计算；`arm64-v8a`时，设置`USE_EIGEN_FOR_BLAS=OFF`，使用OpenBLAS进行矩阵计算
+### 编译和安装
 CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
@@ -72,4 +143,4 @@ make install
 注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
-执行完安装命令后，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Android版本的库。自此，PaddlePaddle的已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
+执行完安装命令后，`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录，其中`include`中包含C-API的头文件，`lib`中包含若干个不同Android ABI的PaddlePaddle库，`third_party`中包含所依赖的所有第三方库。自此，PaddlePaddle的已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
--- a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
+# 构建iOS平台上的PaddlePaddle库
+交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
+## 准备交叉编译环境
+Apple官方为iOS开发提供了完整的交叉编译工具和集成开发环境，用户从App Store下载安装Xcode即可。也可自行前往官网下载，[Xcode](https://developer.apple.com/cn/xcode/)。安装完成之后，可在命令行执行`xcodebuild -version`，判断是否安装成功。
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+## 配置交叉编译参数
+PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake)，以提供一些默认的编译器和编译参数配置。
+交叉编译iOS版本的PaddlePaddle库时，有一些必须配置的参数：
+- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
+- `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
+- `WITH_SWIG_PY`，必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+iOS平台可选配置参数：
+- `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
+  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
+  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
+   | IOS_PLATFORM | IOS_ARCH             |
+   |--------------|----------------------|
+   |   OS         | armv7, armv7s, arm64 (默认) |
+   | SIMULATOR    | i386, x86_64 (默认)         |   
+- `IOS_DEPLOYMENT_TARGET`，最小的iOS部署版本，默认值为`7.0`。
+- `IOS_ENABLE_BITCODE`，是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3)，可设置`ON/OFF`，默认值为`ON`。
+- `IOS_USE_VECLIB_FOR_BLAS`，是否使用[vecLib](https://developer.apple.com/documentation/accelerate/veclib)框架进行BLAS矩阵计算，可设置`ON/OFF`，默认值为`OFF`。
+- `IOS_DEVELOPMENT_ROOT`，`Developer`目录，可显式指定为`/path/to/platform/Developer`。若未显式指定，PaddlePaddle将会根据`IOS_PLATFORM`自动选择`Xcode`对应`platform`的`Developer`目录。
+- `IOS_SDK_ROOT`，所使用`SDK`的根目录，可显式指定为`/path/to/platform/Developer/SDKs/SDK`。若未显式指定，PaddlePaddle将会自动选择`IOS_DEVELOPMENT_ROOT`目录下最新的`SDK`版本。
+其他配置参数：
+- `USE_EIGEN_FOR_BLAS`，是否使用Eigen库进行矩阵计算，在`IOS_USE_VECLIB_FOR_BLAS=OFF`时有效。可设置`ON/OFF`，默认值为`OFF`。
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。默认值为环境变量`CC/CXX`的值；若环境变量`CC/CXX`未设置，则使用`cc/c++`编译器。
+常用的cmake配置如下：
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望得到最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
+**性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 设置`IOS_USE_VECLIB_FOR_BLAS=ON`，调用`vecLib`框架提供的BLAS函数进行矩阵计算。
+## 编译和安装
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
+```
+$ make
+$ make install
+```
+注意：如果你曾在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+执行完安装命令后，`your/path/to/install`目录中会包含以下内容：
+- `include`目录，其中包含所有C-API的头文件
+- `lib`目录，其中包含PaddlePaddle的C-API静态库
+- `third_party`目录，其中包含所依赖的所有第三方库
+注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
 # 构建Raspberry Pi平台上的PaddlePaddle库
-对于Rasspberry Pi系统，用户可通过ssh等方式登录到Raspberry Pi系统上，按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述，直接编译Raspberry Pi平台上适用的PaddlePaddle库。
+通常有两个方法来构建基于 Rasspberry Pi 的版本：
-用户也可以在自己熟悉的开发平台上，通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例，介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。
+1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。
-## 准备交叉编译环境
+1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。
-从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链，也可通过以下命令获取：
+## 安装交叉编译器
+克隆下面 Github repo
 ```bash
 git clone https://github.com/raspberrypi/tools.git
 ```
-该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境，则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具，所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。
+即可在 `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器上以及 2.14版本以上的 glibc。
-注意，该编译工具链需要系统glibc支持2.14以上。
 ## 配置交叉编译参数
-CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)，以提供一些默认的编译器和编译参数相关配置。
+CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。
 交叉编译Raspberry Pi版本PaddlePaddle库时，有一些必须配置的参数：
- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后，PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本，并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。
+- `CMAKE_SYSTEM_NAME`：CMake编译的目标平台，必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后，PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本，并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。
-Raspberry Pi平台可选配置参数：
- `RPI_TOOLCHAIN`，编译工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器；否则，用户需要在cmake时手动设置这些值。无默认值。
+- `RPI_TOOLCHAIN`：编译工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器；否则，用户需要在cmake时手动设置这些值。无默认值。
- `RPI_ARM_NEON`，是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
-其他配置参数：
+- `RPI_ARM_NEON`：是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
 - `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
-cmake参数如下；
+一个常用的CMake配置如下：
 ```
 cmake -DCMAKE_SYSTEM_NAME=RPi \
@@ -47,7 +44,9 @@ cmake -DCMAKE_SYSTEM_NAME=RPi \
      ..
 ```
-用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
+其中`WITH_C_API=ON`表示需要构建推理库。
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。
 ## 编译和安装
@@ -60,6 +59,4 @@ make install
 注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
-执行完安装命令后，由于上一步cmake配置中`WITH_C_API`设置为`ON`，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Raspberry Pi版本的库。
+执行完安装命令后，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Raspberry Pi版本的库。
-更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。
--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
+# Build PaddlePaddle for Raspberry Pi
+You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi:
+1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
+1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article.
+## The Cross-Compiling Toolchain
+Step 1. Clone the Github repo by running the following command.
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+Step 2. Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`.  To run it on a Linux computer, glibc version >= 2.14 is needed.
+## CMake Arguments
+CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling).  All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
+Some important arguments that need to be set:
+- `CMAKE_SYSTEM_NAME`: The target platform.  Must be `RPi`.
+- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain.
+- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and set default to `ON`.
+- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host.  It is used to build building tools running on the host, for example, protoc.
+A commonly-used CMake configuration is as follows:
+```
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`.
+You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`.
+## Build and Install
+The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies.
+```bash
+make
+make install
+```
+ The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`.
+The infernece library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`.
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -21,7 +21,7 @@ wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
      :align: center
 一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
@@ -96,7 +96,7 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
      :align: center
 在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 

--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -19,7 +19,7 @@ Simple Gated Recurrent Neural Network
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
-.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
     :align: center
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
@@ -78,7 +78,7 @@ Sequence to Sequence Model with Attention
 -----------------------------------------
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
      :align: center
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.

--- a/doc/tutorials/sentiment_analysis/bi_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/bi_lstm.jpg
--- a/doc/tutorials/text_generation/encoder-decoder-attention-model.png
+++ b/doc/tutorials/text_generation/encoder-decoder-attention-model.png
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
-# Contribute Code
-We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code.
-## Code Requirements
- Your code comments must be fully documented by
-  [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler
-  passes the code style check.
- All code must have unit test.
- Pass all unit tests.
-The following tutorial guides you into submitting your contibution.
-## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
-Just head over to the GitHub page and click the "Fork" button.
-It's just that simple.
-## Clone
-Clone remote repository.
-```bash
-➜  git clone https://github.com/USERNAME/Paddle
-➜  cd Paddle
-```
-## Create a local branch
-Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
-All feature and bug fix development work should be done on a new branch, generally create new branch from `develop` branch .
-```bash
-➜  git checkout -b my-cool-stuff
-```
-Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`.
-## Using `pre-commit` hook
-Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
-pre-commit hooks. It can help us format source codes (cpp, python), check some
-basic thing before commit (only one EOL for each file, do not add a huge file
-in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every
-PR doesn't fit hook can not be merged into Paddle.
-To use [pre-commit](http://pre-commit.com/), you should install it by
-`pip install pre-commit`, and currently, Paddle uses `clang-format` to format
-c/cpp sources. Please make sure clang-format 3.8+ installed.
-Install and run it as follow:
-```bash
-➜  pip install pre-commit
-➜  pre-commit install
-```
-When you commit your code, the pre-commit hook will check the local code if there is
-anything not suitable to commit, and so on.
-## Start to develop
-In this tutorial, I delete a line in README.md and created a new file.
-We can use `git status` to inspect the changes of current directory, `git diff` to see difference.
-```bash
-➜  git status
-On branch test
-Changes not staged for commit:
-  (use "git add <file>..." to update what will be committed)
-  (use "git checkout -- <file>..." to discard changes in working directory)
-	modified:   README.md
-Untracked files:
-  (use "git add <file>..." to include in what will be committed)
-	test
-no changes added to commit (use "git add" and/or "git commit -a")
-```
-## Build and Test
-We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. 
-If you want to build the develop image, just run:
-```bash
-➜  docker build -t paddle:dev .
-```
-Then we can use the develop image to build PaddlePaddle source. For example:
-```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
-```
-The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
-Then we can generate the production image by copying the compiled PaddlePaddle program into the image by
-```bash
-➜  docker build -t paddle:prod -f build/Dockerfile .
-```
-Run unit test finally:
-```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-```
-For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
-## Commit
-Next we cancel the changes to the README.md file and then commit our changes by following command lines:
-```bash
-➜  git checkout -- README.md
-➜  git status
-On branch test
-Untracked files:
-  (use "git add <file>..." to include in what will be committed)
-	test
-nothing added to commit but untracked files present (use "git add" to track)
-➜  git add test
-```
-We should write a description of each commit by `git commit` to allow others to know
-the changes in these files.
-```bash
-➜  git commit
-CRLF end-lines remover...............................(no files to check)Skipped
-yapf.................................................(no files to check)Skipped
-Check for added large files..............................................Passed
-Check for merge conflicts................................................Passed
-Check for broken symlinks................................................Passed
-Detect Private Key...................................(no files to check)Skipped
-Fix End of Files.....................................(no files to check)Skipped
-clang-formater.......................................(no files to check)Skipped
-[my-cool-stuff c703c041] add test file
- 1 file changed, 0 insertions(+), 0 deletions(-)
- create mode 100644 233
-```
-## Keeping Fork Up to Date
-Before pull your request, you should sync your code from the latest PaddlePaddle.
-To do this, you'll need to add a remote at first:
-```bash
-➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
-➜  git remote
-origin
-upstream
-```
-Update your fork with the latest upstream changes:
-```bash
-➜  git fetch upstream
-➜  git pull upstream develop
-```
-Now, your local master branch is up-to-date with everything modified upstream.
-## Push to GitHub
-```bash
-# push to your repository in Github
-➜  git push origin my-cool-stuff
-```
-## Create an issue and a Pull Request
-Create an Issue to describe the problem and record its number.
-Go to the page for your fork on GitHub, select your development branch,
-and click the `New pull request`.
-<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
-Then select the target branch:
-<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
-We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merge. More details in <https://help.github.com/articles/closing-issues-via-commit-messages/>.
-Then wait for review, if there need to modify, refer to the above steps to update the corresponding origin branch.
-## Delete origin branch
-After the PR is merge into the main repository, we can delete the remote branch on the PR page.
-<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
-Or just run:
-```bash
-➜  git push origin :my-cool-stuff
-```
-## Delete local branch
-Finally, we delete local branch:
-```bash
-➜  git checkout develop 
-# delete my-cool-stuff branch
-➜  git branch -D my-cool-stuff
-```
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
+../../../CONTRIBUTING.md
\ No newline at end of file
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 - 在 `.cu`文件中注册GPU Kernel。
@@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 下面解释代码中一些关键的地方:
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第一个参数`self.op` : 前向Op。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
 ### 编译和执行单元测试

--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 - Registering GPU Kernel in `.cu` files
@@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 Some key points in the code above include:
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
- `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
+  - The second variable `"Out"` points to the network's final output target `Out`.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input.
- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
 ### Compiling and Running

--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -21,7 +21,6 @@
  dev/build_cn.rst
  dev/write_docs_cn.rst
-  dev/contribute_to_paddle_cn.md
 模型配置
 --------

--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
--- a/doc/howto/usage/cluster/src/trainer.png
+++ b/doc/howto/usage/cluster/src/trainer.png
--- a/doc/howto/usage/cluster/src/trainer_cn.png
+++ b/doc/howto/usage/cluster/src/trainer_cn.png
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+import gzip
+import math
+import paddle.v2 as paddle
+embsize = 32
+hiddensize = 256
+N = 5
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+def main():
+    # for local training
+    cluster_train = False
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+if __name__ == '__main__':
+    main()
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+import math
+import os
+import paddle.v2 as paddle
+import pickle
+embsize = 32
+hiddensize = 256
+N = 5
+cluster_train_file = "./train_data_dir/train/train.txt"
+cluster_test_file = "./test_data_dir/test/test.txt"
+node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+if not node_id:
+    raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+def cluster_reader_cluster(filename, node_id):
+    def cluster_reader():
+        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
+            for l in f:
+                csv_data = [int(cell) for cell in l.split(",")]
+                yield tuple(csv_data)
+    return cluster_reader
+def main():
+    # get arguments from env
+    # for local training
+    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
+    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
+    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")
+    if not cluster_train:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
+    else:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
+            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
+            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
+            ports_num_for_sparse=int(
+                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
+            num_gradient_servers=int(
+                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
+            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
+            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
+    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
+    word_dict = pickle.load(fn)
+    fn.close()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        cluster_reader_cluster(cluster_test_file, node_id), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
+        num_passes=30,
+        event_handler=event_handler)
+if __name__ == '__main__':
+    main()
--- a/doc/howto/usage/cluster/src/word2vec/prepare.py
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+SPLIT_COUNT = 3
+N = 5
+def file_len(fd):
+    for i, l in enumerate(fd):
+        pass
+    return i + 1
+def split_from_reader_by_line(filename, reader, split_count):
+    fn = open(filename, "w")
+    for batch_id, batch_data in enumerate(reader()):
+        batch_data_str = [str(d) for d in batch_data]
+        fn.write(",".join(batch_data_str))
+        fn.write("\n")
+    fn.close()
+    fn = open(filename, "r")
+    total_line_count = file_len(fn)
+    fn.close()
+    per_file_lines = total_line_count / split_count + 1
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:
+    pickle.dump(word_dict, dict_f)
+split_from_reader_by_line("train.txt",
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
--- a/doc/tutorials/image_classification/cifar.png
+++ b/doc/tutorials/image_classification/cifar.png
--- a/doc/tutorials/image_classification/image_classification.png
+++ b/doc/tutorials/image_classification/image_classification.png
--- a/doc/tutorials/image_classification/index_cn.md
+++ b/doc/tutorials/image_classification/index_cn.md
--- a/doc/tutorials/image_classification/index_en.md
+++ b/doc/tutorials/image_classification/index_en.md
--- a/doc/tutorials/image_classification/lenet.png
+++ b/doc/tutorials/image_classification/lenet.png
--- a/doc/tutorials/image_classification/plot.png
+++ b/doc/tutorials/image_classification/plot.png
--- a/doc/tutorials/image_classification/src/cifar.png
+++ b/doc/tutorials/image_classification/src/cifar.png
--- a/doc/tutorials/image_classification/src/image_classification.png
+++ b/doc/tutorials/image_classification/src/image_classification.png
--- a/doc/tutorials/image_classification/src/lenet.png
+++ b/doc/tutorials/image_classification/src/lenet.png
--- a/doc/tutorials/image_classification/src/plot.png
+++ b/doc/tutorials/image_classification/src/plot.png
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
-# 完整教程
-* [快速入门](quick_start/index_cn.rst)
-* [个性化推荐](rec/ml_regression_cn.rst)
-* [图像分类](image_classification/index_cn.md)
-* [情感分析](sentiment_analysis/index_cn.md)
-* [语义角色标注](semantic_role_labeling/index_cn.md)
-* [机器翻译](text_generation/index_cn.md)
-## 常用模型
-* [ResNet模型](imagenet_model/resnet_model_cn.md)
-* [词向量模型](embedding_model/index_cn.md)
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
-# TUTORIALS
-There are several examples and demos here.
-* [Quick Start](quick_start/index_en.md)
-* [MovieLens Regression](rec/ml_regression_en.rst)
-* [Image Classification](image_classification/index_en.md)
-* [Sentiment Analysis](sentiment_analysis/index_en.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_en.md)
-* [Text Generation](text_generation/index_en.md)
-* [Image Auto-Generation](gan/index_en.md)
-## Model Zoo
-* [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
-* [Embedding: Chinese Word](embedding_model/index_en.md)
--- a/doc/tutorials/rec/ml_dataset_cn.md
+++ b/doc/tutorials/rec/ml_dataset_cn.md
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ b/doc/tutorials/rec/ml_dataset_en.md
--- a/doc/tutorials/rec/ml_regression_cn.rst
+++ b/doc/tutorials/rec/ml_regression_cn.rst
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
--- a/doc/tutorials/rec/rec_regression_network.png
+++ b/doc/tutorials/rec/rec_regression_network.png
--- a/doc/tutorials/semantic_role_labeling/feature.jpg
+++ b/doc/tutorials/semantic_role_labeling/feature.jpg
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ b/doc/tutorials/semantic_role_labeling/index_cn.md
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ b/doc/tutorials/semantic_role_labeling/index_en.md
--- a/doc/tutorials/semantic_role_labeling/network_arch.png
+++ b/doc/tutorials/semantic_role_labeling/network_arch.png
--- a/doc/tutorials/semantic_role_labeling/src/curve.jpg
+++ b/doc/tutorials/semantic_role_labeling/src/curve.jpg
--- a/doc/tutorials/semantic_role_labeling/src/feature.jpg
+++ b/doc/tutorials/semantic_role_labeling/src/feature.jpg
--- a/doc/tutorials/semantic_role_labeling/src/network_arch.png
+++ b/doc/tutorials/semantic_role_labeling/src/network_arch.png
--- a/doc/tutorials/sentiment_analysis/index_cn.md
+++ b/doc/tutorials/sentiment_analysis/index_cn.md
--- a/doc/tutorials/sentiment_analysis/index_en.md
+++ b/doc/tutorials/sentiment_analysis/index_en.md
--- a/doc/tutorials/sentiment_analysis/lstm.png
+++ b/doc/tutorials/sentiment_analysis/lstm.png
--- a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
--- a/doc/tutorials/sentiment_analysis/src/lstm.png
+++ b/doc/tutorials/sentiment_analysis/src/lstm.png
--- a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
--- a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
--- a/doc/tutorials/text_generation/index_cn.md
+++ b/doc/tutorials/text_generation/index_cn.md
--- a/doc/tutorials/text_generation/index_en.md
+++ b/doc/tutorials/text_generation/index_en.md
--- a/doc/v1_api_tutorials/README.md
+++ b/doc/v1_api_tutorials/README.md
--- a/doc/tutorials/embedding_model/index_cn.md
+++ b/doc/tutorials/embedding_model/index_cn.md
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
--- a/doc/tutorials/embedding_model/neural-n-gram-model.png
+++ b/doc/tutorials/embedding_model/neural-n-gram-model.png
--- a/doc/tutorials/gan/gan.png
+++ b/doc/tutorials/gan/gan.png
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
--- a/doc/tutorials/gan/mnist_sample.png
+++ b/doc/tutorials/gan/mnist_sample.png
--- a/doc/tutorials/gan/uniform_sample.png
+++ b/doc/tutorials/gan/uniform_sample.png
--- a/doc/tutorials/imagenet_model/resnet_block.jpg
+++ b/doc/tutorials/imagenet_model/resnet_block.jpg
--- a/doc/tutorials/imagenet_model/resnet_model_cn.md
+++ b/doc/tutorials/imagenet_model/resnet_model_cn.md
--- a/doc/tutorials/imagenet_model/resnet_model_en.md
+++ b/doc/tutorials/imagenet_model/resnet_model_en.md
--- a/doc/tutorials/quick_start/index_cn.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
--- a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
--- a/doc/tutorials/quick_start/src/NetContinuous_en.png
+++ b/doc/tutorials/quick_start/src/NetContinuous_en.png
--- a/doc/tutorials/quick_start/src/NetConv_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetConv_cn.jpg
--- a/doc/tutorials/quick_start/src/NetConv_en.png
+++ b/doc/tutorials/quick_start/src/NetConv_en.png
--- a/doc/tutorials/quick_start/src/NetLR_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetLR_cn.jpg
--- a/doc/tutorials/quick_start/src/NetLR_en.png
+++ b/doc/tutorials/quick_start/src/NetLR_en.png
--- a/doc/tutorials/quick_start/src/NetRNN_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
--- a/doc/tutorials/quick_start/src/NetRNN_en.png
+++ b/doc/tutorials/quick_start/src/NetRNN_en.png
--- a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
+++ b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
--- a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineTest_en.png
+++ b/doc/tutorials/quick_start/src/PipelineTest_en.png
--- a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineTrain_en.png
+++ b/doc/tutorials/quick_start/src/PipelineTrain_en.png
--- a/doc/tutorials/quick_start/src/Pipeline_cn.jpg
+++ b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
--- a/doc/tutorials/quick_start/src/Pipeline_en.jpg
+++ b/doc/tutorials/quick_start/src/Pipeline_en.jpg
--- a/go/.gitignore
+++ b/go/.gitignore
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
--- a/go/glide.lock
+++ b/go/glide.lock
--- a/go/glide.yaml
+++ b/go/glide.yaml
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
--- a/go/master/client.go
+++ b/go/master/client.go
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
--- a/go/master/service.go
+++ b/go/master/service.go
--- a/go/proto/.gitignore
+++ b/go/proto/.gitignore
--- a/go/pserver/CMakeLists.txt
+++ b/go/pserver/CMakeLists.txt
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
--- a/go/pserver/service_internal_test.go
+++ b/go/pserver/service_internal_test.go
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
--- a/paddle/capi/export.sym
+++ b/paddle/capi/export.sym
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
--- a/paddle/capi/export.map
+++ b/paddle/capi/export.map
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
--- a/paddle/framework/proto_desc.h
+++ b/paddle/framework/proto_desc.h
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
--- a/paddle/framework/tensor_array_test.cc
+++ b/paddle/framework/tensor_array_test.cc
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
--- a/paddle/gserver/tests/mkldnn_branch_net.conf
+++ b/paddle/gserver/tests/mkldnn_branch_net.conf
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
--- a/paddle/gserver/tests/test_CRFLayerGrad.cpp
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
--- a/paddle/gserver/tests/test_Expand.cpp
+++ b/paddle/gserver/tests/test_Expand.cpp
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
--- a/paddle/operators/auc_op.h
+++ b/paddle/operators/auc_op.h
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
--- a/paddle/operators/batch_norm_op.cu
+++ b/paddle/operators/batch_norm_op.cu
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/batch_norm_op.md
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu
--- a/paddle/operators/conv2d_transpose_op.cc
+++ b/paddle/operators/conv2d_transpose_op.cc
--- a/paddle/operators/conv2d_transpose_op.cu
+++ b/paddle/operators/conv2d_transpose_op.cu
--- a/paddle/operators/conv2d_transpose_op.h
+++ b/paddle/operators/conv2d_transpose_op.h
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
--- a/paddle/operators/dynamic_recurrent_op.h
+++ b/paddle/operators/dynamic_recurrent_op.h
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cu
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu
--- a/paddle/platform/environment.h
+++ b/paddle/platform/environment.h
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_constant_op.cu
--- a/paddle/operators/fill_constant_op.h
+++ b/paddle/operators/fill_constant_op.h
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/operators/huber_loss_op.h
--- a/paddle/operators/images/batch_norm_fork.dot
+++ b/paddle/operators/images/batch_norm_fork.dot
--- a/paddle/operators/images/batch_norm_fork.png
+++ b/paddle/operators/images/batch_norm_fork.png
--- a/paddle/operators/images/batch_norm_op_kernel.png
+++ b/paddle/operators/images/batch_norm_op_kernel.png
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/increment_op.cu
--- a/paddle/operators/increment_op.h
+++ b/paddle/operators/increment_op.h
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
--- a/paddle/operators/lstm_op.cu
+++ b/paddle/operators/lstm_op.cu
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/math/context_project.cc
+++ b/paddle/operators/math/context_project.cc
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/operators/math/context_project.cu
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
--- a/paddle/operators/math/detail/CMakeLists.txt
+++ b/paddle/operators/math/detail/CMakeLists.txt
--- a/paddle/operators/math/detail/hl_activation_functions.h
+++ b/paddle/operators/math/detail/hl_activation_functions.h
--- a/paddle/operators/math/detail/hl_avx_functions.cc
+++ b/paddle/operators/math/detail/hl_avx_functions.cc
--- a/paddle/operators/math/detail/hl_avx_functions.h
+++ b/paddle/operators/math/detail/hl_avx_functions.h
--- a/paddle/operators/math/detail/hl_cpu_functions.cc
+++ b/paddle/operators/math/detail/hl_cpu_functions.cc
--- a/paddle/operators/math/detail/hl_functions.h
+++ b/paddle/operators/math/detail/hl_functions.h
--- a/paddle/operators/math/detail/hl_gpu_functions.h
+++ b/paddle/operators/math/detail/hl_gpu_functions.h
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
--- a/paddle/operators/matmul_op.cu
+++ b/paddle/operators/matmul_op.cu
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
--- a/paddle/operators/pool_cudnn_op.cu
+++ b/paddle/operators/pool_cudnn_op.cu
--- a/paddle/operators/pool_cudnn_op.h
+++ b/paddle/operators/pool_cudnn_op.h
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu
+++ b/paddle/operators/pool_op.cu
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.cu
+++ b/paddle/operators/pool_with_index_op.cu
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
--- a/paddle/operators/precision_recall_op.h
+++ b/paddle/operators/precision_recall_op.h
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/operators/proximal_adagrad_op.cu
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/operators/proximal_gd_op.cu
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/operators/proximal_gd_op.h
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/operators/rmsprop_op.h
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/seq_expand_op.cu
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
--- a/paddle/operators/sequence_concat_op.cu
+++ b/paddle/operators/sequence_concat_op.cu
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/operators/squared_l2_norm_op.cu
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
--- a/paddle/optimizer/tensor.h
+++ b/paddle/optimizer/tensor.h
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
--- a/paddle/platform/environment_test.cc
+++ b/paddle/platform/environment_test.cc
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/exception.h
+++ b/paddle/pybind/exception.h
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/scripts/cluster_train_v2/fabric/conf.py
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
--- a/paddle/scripts/cluster_train_v2/fabric/run.sh
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
--- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
--- a/proto/TrainerConfig.proto
+++ b/proto/TrainerConfig.proto
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/framework/backward.py
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
--- a/python/paddle/v2/framework/evaluator.py
+++ b/python/paddle/v2/framework/evaluator.py
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/framework/initializer.py
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/framework/io.py
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/framework/net_drawer.py
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/framework/regularizer.py
--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/framework/tests/.gitignore
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_adadelta_op.py
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_adam_op.py
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
--- a/python/paddle/v2/framework/tests/test_auc_op.py
+++ b/python/paddle/v2/framework/tests/test_auc_op.py
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_batch_norm_op.py
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/framework/tests/test_cast_op.py
--- a/python/paddle/v2/framework/tests/test_clip_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ b/python/paddle/v2/framework/tests/test_evaluator.py
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_train.py
--- a/python/paddle/v2/framework/tests/test_increment_op.py
+++ b/python/paddle/v2/framework/tests/test_increment_op.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/framework/tests/test_inference_model_io.py
--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/framework/tests/test_initializer.py
--- a/python/paddle/v2/framework/tests/test_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_identity_op.py
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
--- a/python/paddle/v2/framework/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py
--- a/python/paddle/v2/framework/tests/test_lrn_op.py
+++ b/python/paddle/v2/framework/tests/test_lrn_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_matmul_op.py
+++ b/python/paddle/v2/framework/tests/test_matmul_op.py
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
--- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/framework/tests/test_parameter.py
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_precision_recall_op.py
+++ b/python/paddle/v2/framework/tests/test_precision_recall_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
--- a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_proximal_gd_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ b/python/paddle/v2/framework/tests/test_recommender_system.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/framework/tests/test_regularizer.py
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/framework/tests/test_selected_rows.py
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
--- a/python/paddle/v2/framework/tests/test_seq_conv.py
+++ b/python/paddle/v2/framework/tests/test_seq_conv.py
--- a/python/paddle/v2/framework/tests/test_seq_expand.py
+++ b/python/paddle/v2/framework/tests/test_seq_expand.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/framework/tests/test_add_op.py
+++ b/python/paddle/v2/framework/tests/test_add_op.py
--- a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/framework/tests/test_variable.py
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/model.py
+++ b/python/paddle/v2/model.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/plot/plot.py
+++ b/python/paddle/v2/plot/plot.py
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
--- a/v1_api_demo/README.md
+++ b/v1_api_demo/README.md