Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-3923-r

9a3f50d8 · yangyaming · df83ac4c · fa72e544 · 9a3f50d8 · 9a3f50d8
814 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -22,7 +22,9 @@ cmake-build-*
 # generated while compiling
 python/paddle/v2/framework/core.so
+paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
+paddle/pybind/pybind.h
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,6 +31,3 @@
    -   id: go-fmt
        types:
        - go
-    -   id: gometalinter
-        types:
-        - go
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,10 +36,6 @@ before_install:
  # protobuf version.
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
-  - curl https://glide.sh/get | bash
-  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-  - go get -u github.com/alecthomas/gometalinter
-  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-if(NOT ANDROID)
+if(NOT ANDROID AND NOT IOS)
    find_package(Boost QUIET)
 endif()
@@ -64,27 +64,37 @@ if(NOT CMAKE_BUILD_TYPE)
      FORCE)
 endif()
-if(ANDROID)
+if(ANDROID OR IOS)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
+    if(ANDROID)
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
+        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
-    elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        # TODO: support glog for Android api 16 ~ 19 in the future
+        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-        message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
+            # TODO: support glog for Android api 16 ~ 19 in the future
+            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
+        endif()
    endif()
    set(WITH_GPU OFF CACHE STRING
-        "Disable GPU when cross-compiling for Android" FORCE)
+        "Disable GPU when cross-compiling for Android and iOS" FORCE)
    set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when cross-compiling for Android" FORCE)
+        "Disable AVX when cross-compiling for Android and iOS" FORCE)
    set(WITH_PYTHON OFF CACHE STRING
-        "Disable PYTHON when cross-compiling for Android" FORCE)
+        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
    set(WITH_RDMA OFF CACHE STRING
-        "Disable RDMA when cross-compiling for Android" FORCE)
+        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android" FORCE)
+        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android" FORCE)
+        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
-endif(ANDROID)
+    # Compile PaddlePaddle mobile inference library
+    if (NOT WITH_C_API)
+        set(WITH_C_API ON CACHE STRING
+            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
+    endif()
+    set(MOBILE_INFERENCE ON)
+    add_definitions(-DPADDLE_MOBILE_INFERENCE)
+endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")
@@ -95,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON)
    "different Python interpreter from compiling.")
 endif()
+if(MOBILE_INFERENCE)
+    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
+else()
+    set(THIRD_PARTY_BUILD_TYPE Release)
+endif()
 ########################################################################################
 include(external/mklml)     # download mklml package
@@ -111,6 +127,7 @@ include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)    # download pybind11
+include(external/nccl)
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -143,7 +160,7 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
    endif(NOT WITH_DSO)
 endif(WITH_GPU)
@@ -158,9 +175,11 @@ endif(USE_NNPACK)
 add_subdirectory(proto)
-# "add_subdirectory(go)" should be placed after the following loine,
+if(NOT MOBILE_INFERENCE)
-# because it depends on paddle/optimizer.
+    # "add_subdirectory(go)" should be placed after the following line,
-add_subdirectory(paddle/optimizer)
+    # because it depends on paddle/optimizer.
+    add_subdirectory(paddle/optimizer)
+endif()
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.

--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \

--- a/README.md
+++ b/README.md
@@ -51,19 +51,19 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 - **Connected to Products**
    In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products or service with a vast number
+    PaddlePaddle has been deployed into products and services with a vast number
    of users, including ad click-through rate (CTR) prediction, large-scale image
    classification, optical character recognition(OCR), search ranking, computer
    virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also exploit
+    Baidu and it has achieved a significant impact. We hope you can also explore
-    the capability of PaddlePaddle to make a huge impact for your product.
+    the capability of PaddlePaddle to make an impact on your product.
 ## Installation
 It is recommended to check out the
 [Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
 ## Documentation
@@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
-  You might want to start from this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in a Jupyter Notebook.
 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)

--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 def process(settings, file_list):
    for i in xrange(1024):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
+        lab = random.randint(0, settings.num_class - 1)
        yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
+set -e
+# Run one timed `paddle train` job and tee its output into logs/.
+#   $1 - topology name; "<topology>.py" is used as the config file
+#   $2 - batch size, forwarded to the config as batch_size
+#   $3 - "True" to run with MKL-DNN, "False" to run the MKLML baseline
+# Exits the script with a non-zero status when $3 is neither value.
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS
+  export OMP_DYNAMIC="FALSE"
+  export KMP_AFFINITY="granularity=fine,compact,0,0"
+  topology=$1
+  bs=$2
+  use_mkldnn=$3
+  # Quote the operand so an empty or missing argument is still a valid test.
+  if [ "$use_mkldnn" == "True" ]; then
+    thread=1
+    log="logs/${topology}-mkldnn-${bs}.log"
+  elif [ "$use_mkldnn" == "False" ]; then
+    thread=`nproc`
+    # each trainer_count use only 1 core to avoid conflict
+    export OMP_NUM_THREADS=1
+    export MKL_NUM_THREADS=1
+    log="logs/${topology}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $3, use True or False."
+    # Bad usage is a failure; report it to the caller instead of exiting 0.
+    exit 1
+  fi
+  args="batch_size=${bs}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config="$config" \
+    --use_mkldnn="$use_mkldnn" \
+    --use_gpu=False \
+    --trainer_count="$thread" \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args="$args" \
+    2>&1 | tee "${log}"
+}
+# train.list is created below as a regular file, so test for it with -f.
+# A -d test is never true for a file and rewrote the list on every run.
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+# Log files are written under logs/; create the directory on first use.
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+#========== mkldnn ==========#
+train vgg 64 True
+train vgg 128 True
+train vgg 256 True
+#========== mklml ===========#
+train vgg 64 False
+train vgg 128 False
+train vgg 256 False
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+img = data_layer(name='image', size=height * width * 3)
+def vgg_network(vgg_num=3):
+    """Stack the VGG layers on the module-level `img` layer.
+
+    vgg_num is the number of convolutions in each of the last three
+    convolution groups. Returns the final softmax fc_layer of size num_class.
+    """
+    def conv_block(source, filters, **extra):
+        # Every group shares the same conv/pool settings; only the input,
+        # the filter list and (for the first group) num_channels differ.
+        return img_conv_group(
+            input=source,
+            conv_num_filter=filters,
+            conv_padding=1,
+            conv_filter_size=3,
+            conv_act=ReluActivation(),
+            pool_size=2,
+            pool_stride=2,
+            pool_type=MaxPooling(),
+            **extra)
+    net = conv_block(img, [64, 64], num_channels=3)
+    net = conv_block(net, [128, 128])
+    net = conv_block(net, [256] * vgg_num)
+    # The last two groups reuse one 512-filter list.
+    wide = [512] * vgg_num
+    net = conv_block(net, wide)
+    net = conv_block(net, wide)
+    # Two identical 4096-unit ReLU layers with dropout 0.5.
+    for _ in range(2):
+        net = fc_layer(
+            input=net,
+            size=4096,
+            act=ReluActivation(),
+            layer_attr=ExtraAttr(drop_rate=0.5))
+    return fc_layer(input=net, size=num_class, act=SoftmaxActivation())
+# Select the topology depth from the layer_num config argument.
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    # Fail here with a clear message; merely printing left `vgg` undefined
+    # and surfaced later as a NameError when the loss was built.
+    raise ValueError("Wrong layer number: %s (use 16 or 19)." % layer_num)
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=vgg, label=lab)
+outputs(loss)
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -171,3 +171,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
+if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER vecLib)
+  set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
+  add_definitions(-DPADDLE_USE_VECLIB)
+endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,10 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
+if(WITH_TESTING)
+    add_definitions(-DPADDLE_WITH_TESTING)
+endif(WITH_TESTING)
 if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
@@ -49,19 +53,20 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
    FIND_PACKAGE(CUDA REQUIRED)
    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
    endif()
    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
+        message(FATAL_ERROR "Paddle needs cudnn to compile")
    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -26,9 +26,9 @@ set(IGNORE_PATTERN
    .*ImportanceSampler.*
    .*cblas\\.h.*
    .*\\.pb\\.txt
-    .*LtrDataProvider.*
    .*MultiDataProvider.*
-    .*pb.*)
+    .*pb.*
+    .*pybind.h)
 # add_style_check_target
 #

--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This is a toolchain file for cross-compiling for iOS, and the
+# configuration largely refers to public toolchain file:
+#    https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
+# and
+#    https://github.com/cristeab/ios-cmake
+#
+# Supports options:
+# IOS_PLATFORM = OS (default) or SIMULATOR
+#   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
+#   OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
+#   SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
+# IOS_ARCH
+#   The architectures to support, such as "arm64" or "armv7;arm64"
+# IOS_DEPLOYMENT_TARGET
+#   The minimum iOS deployment version, such as "7.0"
+# IOS_ENABLE_BITCODE = ON (default) or OFF
+# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
+# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
+#   By default this location is automatically chosen based on the IOS_PLATFORM value above.
+#   If set manually, it will override the default location and force the use of a particular Developer Platform
+# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
+#   By default this location is automatically chosen based on the IOS_DEVELOPER_ROOT value.
+#   In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
+#   If set manually, this will force the use of a specific SDK version
+# Macros:
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+#  A convenience macro for setting xcode specific properties on targets
+#  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+# find_host_package (PROGRAM ARGS)
+#  A macro used to find executable programs on the host system, not within the iOS environment.
+#  Thanks to the android-cmake project for providing the command
+if(NOT IOS)
+  return()
+endif()
+set(CMAKE_SYSTEM_NAME Darwin)
+# Get the Xcode version being used.
+execute_process(COMMAND xcodebuild -version
+                OUTPUT_VARIABLE XCODE_VERSION
+                RESULT_VARIABLE XCODE_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT ${XCODE_VERSION_RESULT})
+  string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
+  string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
+  message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
+else()
+  message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
+endif()
+# Required as of cmake 2.8.10
+set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
+# Setup iOS platform unless specified manually with IOS_PLATFORM
+if(NOT DEFINED IOS_PLATFORM)
+  set(IOS_PLATFORM "OS")
+endif()
+set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
+# Set the architecture for iOS.
+# When IOS_ARCH is not given, pick a default from IOS_PLATFORM; an unknown
+# platform leaves it unset here.
+if(NOT DEFINED IOS_ARCH)
+  if(IOS_PLATFORM STREQUAL "OS")
+    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
+    set(IOS_ARCH "arm64")
+  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
+    set(IOS_ARCH "i386;x86_64")
+  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
+    set(IOS_ARCH armv7k)
+  endif()
+endif()
+# Cache type keywords are upper-case; "string" is not a recognised type.
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+# Specify minimum iOS deployment version
+if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
+  set(IOS_DEPLOYMENT_TARGET "7.0")
+endif()
+set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
+# Whether to enable bitcode
+if(NOT DEFINED IOS_ENABLE_BITCODE)
+  set(IOS_ENABLE_BITCODE ON)
+endif()
+set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
+# Whether to use vecLib as the BLAS provider (off unless requested).
+if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
+  set(IOS_USE_VECLIB_FOR_BLAS OFF)
+endif()
+# Cache the resolved value. The variable name must match exactly: a misspelt
+# reference expands to nothing and caches an empty value instead.
+set(IOS_USE_VECLIB_FOR_BLAS ${IOS_USE_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
+# Check the platform selection and setup for developer root
+if(${IOS_PLATFORM} STREQUAL "OS")
+  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+  set(XCODE_IOS_PLATFORM iphoneos)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
+elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
+  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
+  set(XCODE_IOS_PLATFORM iphonesimulator)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
+elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
+  set(IOS_PLATFORM_LOCATION "WatchOS.platform")
+  set(XCODE_IOS_PLATFORM watchos)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
+else(${IOS_PLATFORM} STREQUAL "OS")
+  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
+          "\t OS, SIMULATOR, or WATCHOS.")
+endif()
+# Check iOS developer toolchain
+if(NOT DEFINED IOS_DEVELOPER_ROOT)
+  # Setup iOS developer location
+  execute_process(COMMAND xcode-select -print-path
+                  OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
+                  RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # Xcode 4.3 changed the installation location, choose the most recent one available
+  if(${XCODE_VERSION} VERSION_LESS "4.3.0")
+    set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  else()
+    set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  endif()
+endif()
+if(EXISTS ${IOS_DEVELOPER_ROOT})
+  set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+else()
+  message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
+endif()
+# Check iOS SDK
+if(NOT DEFINED IOS_SDK_ROOT)
+  # Find and use the most recent iOS sdk
+  file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
+  if(IOS_SDK_LISTS)
+    list(SORT IOS_SDK_LISTS)
+    list(REVERSE IOS_SDK_LISTS)
+    list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
+  else(IOS_SDK_LISTS)
+    message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
+            " Please manually set IOS_SDK_ROOT or install the iOS SDK.")
+  endif(IOS_SDK_LISTS)
+endif()
+if(EXISTS ${IOS_SDK_ROOT})
+  set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+  message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
+else()
+  message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
+endif()
+# Set the sysroot default to the most recent SDK
+set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+# Get version of iOS SDK
+execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
+                OUTPUT_VARIABLE IOS_SDK_VERSION
+                RESULT_VARIABLE IOS_SDK_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(${IOS_SDK_VERSION_RESULT})
+  string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
+endif()
+if(NOT IOS_SDK_VERSION)
+  message(WARNING "Cannot get SDK's version.")
+  set(IOS_SDK_VERSION 1)
+endif()
+set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
+# Find the C & C++ compilers for the specified SDK.
+if(NOT CMAKE_C_COMPILER)
+  # Default to use clang
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
+                  OUTPUT_VARIABLE IOS_C_COMPILER
+                  RESULT_VARIABLE IOS_C_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_C_COMPILER_RESULT})
+    get_filename_component(IOS_C_COMPILER clang PROGRAM)
+  endif()
+else(NOT CMAKE_C_COMPILER)
+  # User can set it in cmake command
+  get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+endif(NOT CMAKE_C_COMPILER)
+if(NOT EXISTS ${IOS_C_COMPILER})
+  message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+  # Default to use clang++
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
+                  OUTPUT_VARIABLE IOS_CXX_COMPILER
+                  RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_CXX_COMPILER_RESULT})
+    get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
+  endif()
+else(NOT CMAKE_CXX_COMPILER)
+  # User can set it in cmake command
+  get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+endif(NOT CMAKE_CXX_COMPILER)
+if(NOT EXISTS ${IOS_CXX_COMPILER})
+  message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
+endif()
+set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
+set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
+set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
+set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
+set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
+# Set iOS specific C/C++ flags
+# XCODE_IOS_PLATFORM_VERSION_FLAGS carries the minimum-deployment-version flag;
+# it is consumed by IOS_COMPILER_FLAGS below and by IOS_LINK_FLAGS.
+if(IOS_PLATFORM STREQUAL "OS")
+  if(XCODE_VERSION VERSION_LESS "7.0")
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
+  else()
+    # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
+  endif()
+else()
+  # Assign the same variable as the branch above, otherwise the
+  # min-version flag is silently dropped for non-device builds.
+  # NOTE(review): this branch is also taken for WATCHOS - confirm the simulator flag is intended there.
+  set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
+endif()
+if(IOS_ENABLE_BITCODE)
+  # NOTE(review): IOS_COMPILER_FLAGS is not assigned by this file before this point,
+  # so it is empty here unless the caller defined it - kept as-is.
+  set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
+else()
+  set(XCODE_IOS_BITCODE_FLAGS "")
+endif()
+set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
+# Hidden visibility is required for cxx on iOS
+set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
+if(IOS_USE_VECLIB_FOR_BLAS)
+  # Find vecLib for iOS
+  set(VECLIB_SEARCH_DIRS
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
+      )
+  # Use PATH_SUFFIXES so the framework header sub-directory is tried under every
+  # search dir; appending it to the list expansion only suffixed the last entry.
+  find_path(VECLIB_INC_DIR vecLib.h
+            PATHS ${VECLIB_SEARCH_DIRS}
+            PATH_SUFFIXES vecLib.framework/Headers)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
+  if(VECLIB_FOUND)
+    # IOS_LINK_FLAGS is interpolated into the CMAKE_<LANG>_LINK_FLAGS strings,
+    # so extend it as one space-separated string; list-style appends would put
+    # ';' separators into the link command line.
+    if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
+      set(IOS_LINK_FLAGS "${IOS_LINK_FLAGS} -lcblas -framework vecLib")
+      message(STATUS "Found standalone vecLib.framework")
+    else()
+      set(IOS_LINK_FLAGS "${IOS_LINK_FLAGS} -lcblas -framework Accelerate")
+      message(STATUS "Found vecLib as part of Accelerate.framework")
+    endif()
+  endif()
+endif()
+set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+if(NOT IOS_ENABLE_BITCODE)
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+else()
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
+endif()
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
+# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
+# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
+# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
+if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif()
+# Set the find root to the iOS developer roots and to user defined paths
+# (cache type keywords are upper-case; "string" is not a recognised type)
+set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
+    CACHE STRING "iOS find search path root")
+# default to searching for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${IOS_SDK_ROOT}/System/Library/Frameworks
+    ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
+    ${IOS_SDK_ROOT}/Developer/Library/Frameworks
+    )
+# only search the iOS sdks, not the remainder of the host filesystem
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
+        "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
+message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
+message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+# Used in ExternalProject command
+string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+set(EXTERNAL_OPTIONAL_ARGS
+    -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
+    -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
+# This little macro lets you set any XCode specific property
+#   TARGET         - target whose property is set
+#   XCODE_PROPERTY - attribute name; stored as XCODE_ATTRIBUTE_<XCODE_PROPERTY>
+#   XCODE_VALUE    - value assigned to that attribute
+# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+  set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
+endmacro(set_xcode_property)
+# This macro lets you find executable programs on the host system
+# Arguments are forwarded unchanged to find_package(). While it runs, the
+# find-root modes are NEVER and IOS is FALSE, so the search is not re-rooted
+# into the iOS SDK.
+macro(find_host_package)
+  # Save the caller's modes so they can be restored exactly; forcing ONLY
+  # afterwards would change a PROGRAM mode that was NEVER before the call.
+  set(_ios_saved_mode_program "${CMAKE_FIND_ROOT_PATH_MODE_PROGRAM}")
+  set(_ios_saved_mode_library "${CMAKE_FIND_ROOT_PATH_MODE_LIBRARY}")
+  set(_ios_saved_mode_include "${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}")
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set(IOS FALSE)
+  find_package(${ARGN})
+  set(IOS TRUE)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM "${_ios_saved_mode_program}")
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY "${_ios_saved_mode_library}")
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE "${_ios_saved_mode_include}")
+  # A macro shares the caller's scope, so drop the temporaries.
+  unset(_ios_saved_mode_program)
+  unset(_ios_saved_mode_library)
+  unset(_ios_saved_mode_include)
+endmacro(find_host_package)
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -8,7 +8,7 @@ ExternalProject_Add(
    extern_eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         "master"
+    GIT_TAG         4e79cb69b9425f5f8c3a84be4350d4ab75b5fd9d
    PREFIX          ${EIGEN_SOURCE_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -36,19 +36,21 @@ ExternalProject_Add(
    # change this back to the official Github repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
+    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
+                    -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -31,23 +31,25 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_TAG         v0.3.5
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
+                    -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=ON
+                    -DWITH_GFLAGS=ON
-    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
+                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
+                    -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -48,18 +48,19 @@ IF(WITH_TESTING)
        PREFIX          ${GTEST_SOURCES_DIR}
        UPDATE_COMMAND  ""
        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+                        -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-        CMAKE_ARGS      -DBUILD_GMOCK=ON
+                        -DBUILD_GMOCK=ON
-        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
+                        -Dgtest_disable_pthreads=ON
-        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+                        -Dgtest_force_shared_crt=ON
-        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                        ${EXTERNAL_OPTIONAL_ARGS}
        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=Release
+                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    )
    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)

--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
+# Fetches NVIDIA NCCL (tag v1.3.4-1) as the external project `extern_nccl`
+# and exposes it to the rest of the build as the `nccl` target.
+#   WITH_DSO on : the sources are only downloaded (build/install steps are
+#                 disabled) and `nccl` is a placeholder target.
+#   WITH_DSO off: NCCL is built with its own Makefile and `nccl` is an
+#                 imported static library.
+INCLUDE(ExternalProject)
+SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl)
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies.
+  # An empty string explicitly disables the matching ExternalProject step.
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+  # Commands are kept as lists (one element per argument). A single quoted
+  # string such as "make -j 8" reaches the build tool as ONE program name
+  # and cannot be executed.
+  set(NCCL_BUILD_COMMAND make -j 8)
+  # NOTE(review): assumes the NCCL v1 Makefile honours PREFIX= on
+  # `make install`; without it nothing lands in NCCL_INSTALL_DIR - confirm.
+  set(NCCL_INSTALL_COMMAND make install PREFIX=${NCCL_INSTALL_DIR})
+endif()
+ExternalProject_Add(
+        extern_nccl
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
+        GIT_TAG         "v1.3.4-1"
+        PREFIX          "${NCCL_SOURCE_DIR}"
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        # There is no configure step generating a separate build tree, so
+        # the build/install commands must run inside the checked-out sources.
+        BUILD_IN_SOURCE   1
+        # Quoted on purpose: an empty value must still be passed so the step
+        # is disabled, while a list value is split back into its arguments.
+        BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
+        INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
+        INSTALL_DIR       "${NCCL_INSTALL_DIR}"
+        TEST_COMMAND      ""
+)
+if (WITH_DSO)
+  if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    # Older CMake: fall back to a static library built from a generated
+    # one-line dummy source instead of an INTERFACE library.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION
+          ${NCCL_INSTALL_DIR}/lib/libnccl.a)
+endif()
+# Make sure the download (and build, when enabled) finishes before anything
+# that depends on `nccl` is built.
+add_dependencies(nccl extern_nccl)
+LIST(APPEND external_project_dependencies nccl)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,30 +29,41 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
-    IF(APPLE)
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-        SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ELSE()
-        SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
-    ENDIF()
    IF(CMAKE_CROSSCOMPILING)
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
+        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
+        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
        IF(ANDROID)
            # arm_soft_fp_abi branch of OpenBLAS to support softfp
            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-                SET(TARGET "ARMV7")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
-                SET(TARGET "ARMV8")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
+            ENDIF()
+        ELSEIF(IOS)
+            # FIXME(liuyiqun): support multiple architectures
+            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
            ENDIF()
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
        ELSEIF(RPI)
            # use hardfp
            SET(OPENBLAS_COMMIT "v0.2.20")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
        ENDIF()
    ELSE()
+        IF(APPLE)
+            SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
        SET(OPENBLAS_COMMIT "v0.2.20")
        SET(OPTIONAL_ARGS "")
        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
@@ -60,6 +71,8 @@ IF(NOT ${CBLAS_FOUND})
        ENDIF()
    ENDIF()
+    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
    ExternalProject_Add(
        extern_openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -173,7 +173,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
            "-Dprotobuf_WITH_ZLIB=ON"
-            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
+            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
+            ${EXTERNAL_OPTIONAL_ARGS})
        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
    ENDIF()
@@ -190,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            ${OPTIONAL_ARGS}
            -Dprotobuf_BUILD_TESTS=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_INSTALL_LIBDIR=lib
        CMAKE_CACHE_ARGS
            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
            ${OPTIONAL_CACHE_ARGS}

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-INCLUDE(ExternalProject)
+IF(NOT WITH_PYTHON)
+    return()
+ENDIF()
 INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
-IF(WITH_PYTHON)
+FIND_PACKAGE(PythonLibs 2.7)
-    FIND_PACKAGE(PythonLibs 2.7)
+# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
-    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
-    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
-    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
-ENDIF(WITH_PYTHON)
 SET(py_env "")
 IF(PYTHONINTERP_FOUND)
@@ -36,9 +37,5 @@ IF(PYTHONINTERP_FOUND)
    ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-IF(WITH_PYTHON)
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-ELSE()
-    SET(PYTHON_LIBRARIES "")
-ENDIF()
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+IF(NOT WITH_SWIG_PY)
+    return()
+ENDIF()
 FIND_PACKAGE(SWIG)
 IF(NOT SWIG_FOUND)

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -16,25 +16,14 @@ INCLUDE(ExternalProject)
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
+    CACHE PATH "Warp-ctc Directory" FORCE)
-SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+# Used in unit test test_WarpCTCLayer
+SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
-IF(WIN32)
+    CACHE PATH "Warp-ctc Library Directory" FORCE)
-    SET(WARPCTC_LIBRARIES
+SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
+    CACHE FILEPATH "Warp-ctc Library" FORCE)
-ELSE(WIN32)
-    IF(APPLE)
-        SET(_warpctc_SHARED_SUFFIX dylib)
-    ELSE(APPLE)
-        SET(_warpctc_SHARED_SUFFIX so)
-    ENDIF(APPLE)
-    SET(WARPCTC_LIBRARIES
-        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
-ENDIF(WIN32)
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
    SET(USE_OMP OFF)
@@ -46,25 +35,30 @@ ExternalProject_Add(
    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
+                    -DWITH_GPU=${WITH_GPU}
-    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+                    -DWITH_OMP=${USE_OMP}
-    CMAKE_ARGS      -DWITH_TORCH=OFF
+                    -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-    CMAKE_ARGS      -DBUILD_SHARED=ON
+                    -DBUILD_SHARED=ON
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
 ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -34,18 +34,19 @@ ExternalProject_Add(
    GIT_TAG         "v1.2.8"
    PREFIX          ${ZLIB_SOURCES_DIR}
    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+                    -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_SHARED_LIBS=OFF
-    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_MACOSX_RPATH=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 LIST(APPEND external_project_dependencies zlib)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -128,8 +128,10 @@ set(GPU_COMMON_FLAGS
 )
 if (APPLE)
-    # On Mac OS X build fat binaries with x86_64 architectures by default.
+    if(NOT CMAKE_CROSSCOMPILING)
-    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+        # On Mac OS X build fat binaries with x86_64 architectures by default.
+        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    endif()
 else()
    set(GPU_COMMON_FLAGS
        -Wall

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
  endforeach()
  list(REMOVE_DUPLICATES libs_deps)
-  if(APPLE) # Use OSX's libtool to merge archives
+  # To produce a library we need at least one source file.
-    # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will
-    # It is created by add_custom_command below and will helps
+  # also help to track dependencies.
-    # also help to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  if(APPLE) # Use OSX's libtool to merge archives
    # Make the generated dummy source file depended on all static input
    # libs. If input lib changes,the source file is touched
    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${dummyfile}
+    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
      DEPENDS ${libs})
    # Generate dummy staic lib
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
    foreach(lib ${libs})
@@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME)
    endforeach()
    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
    foreach(lib ${libs})
-      set(objlistfile ${lib}.objlist) # list of objects in the input library
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
-      set(objdir ${lib}.objdir)
+      set(objdir ${target_DIR}/${lib}.objdir)
      add_custom_command(OUTPUT ${objdir}
        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
@@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME)
      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})
-      # Empty dummy source file that goes into merged library		
+      list(APPEND target_OBJS "${objlistfile}")
-      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})		
-      list(APPEND mergebases "${mergebase}")
    endforeach()
-    add_library(${TARGET_NAME} STATIC ${mergebases})
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+    # Generate dummy static lib
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
    # Get the file name of the generated library
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
-    foreach(lib ${libs})
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
-        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${target_DIR})
-        WORKING_DIRECTORY ${lib}.objdir)
-    endforeach()
  endif()
 endfunction(merge_static_libs)
@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
  else(cc_library_SRCS)
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
    else()
      message(FATAL "Please specify source file or library in cc_library.")
@@ -249,7 +253,7 @@ function(nv_library TARGET_NAME)
      foreach(source_file ${nv_library_SRCS})
        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
@@ -385,13 +389,60 @@ function(go_test TARGET_NAME)
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction(go_test)
+# Protobuf-lite aware variant of the stock 'protobuf_generate_cpp()'.
+# Usage:
+#   paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
+# SRCS and HDRS name the caller-side variables that receive the lists of
+# generated .pb.cc and .pb.h paths; every remaining argument is a .proto file.
+# When MOBILE_INFERENCE is set, "lite:" is prepended to the --cpp_out value.
+function(paddle_protobuf_generate_cpp SRCS HDRS)
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    return()
+  endif()
+  # Prefix for the --cpp_out argument; stays empty unless MOBILE_INFERENCE is set.
+  set(cpp_out_prefix "")
+  if(MOBILE_INFERENCE)
+    set(cpp_out_prefix "lite:")
+  endif()
+  # Local accumulators, copied to the caller's variables at the very end.
+  set(generated_srcs "")
+  set(generated_hdrs "")
+  foreach(proto ${ARGN})
+    get_filename_component(proto_abs ${proto} ABSOLUTE)
+    get_filename_component(proto_stem ${proto} NAME_WE)
+    set(pb_cc "${CMAKE_CURRENT_BINARY_DIR}/${proto_stem}.pb.cc")
+    set(pb_h "${CMAKE_CURRENT_BINARY_DIR}/${proto_stem}.pb.h")
+    list(APPEND generated_srcs "${pb_cc}")
+    list(APPEND generated_hdrs "${pb_h}")
+    # One protoc rule per .proto file, emitting into the current binary dir.
+    add_custom_command(
+      OUTPUT "${pb_cc}" "${pb_h}"
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+              -I${CMAKE_CURRENT_SOURCE_DIR}
+              --cpp_out "${cpp_out_prefix}${CMAKE_CURRENT_BINARY_DIR}" ${proto_abs}
+      DEPENDS ${proto_abs} protoc
+      COMMENT "Running C++ protocol buffer compiler on ${proto}"
+      VERBATIM)
+  endforeach()
+  set_source_files_properties(${generated_srcs} ${generated_hdrs} PROPERTIES GENERATED TRUE)
+  set(${SRCS} ${generated_srcs} PARENT_SCOPE)
+  set(${HDRS} ${generated_hdrs} PARENT_SCOPE)
+endfunction()
 function(proto_library TARGET_NAME)
+  # Builds the C++ library TARGET_NAME from protobuf definitions.
+  #   SRCS: .proto files handed to paddle_protobuf_generate_cpp()
+  #   DEPS: extra dependencies forwarded to cc_library(); `protobuf` is
+  #         always appended.
+  # `options` is defined locally so cmake_parse_arguments can never pick up
+  # a same-named variable inherited from the calling scope.
+  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(proto_srcs)
  set(proto_hdrs)
-  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()

--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -24,11 +24,10 @@ IF(WIN32)
    SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
    IF(APPLE)
-        EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
-        SET(MACOS_VERSION ${VERSION})
        SET(HOST_SYSTEM "macosx")
-        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+        # Ask sw_vers for the host product version and keep its leading
+        # "<major>.<minor>" part as MACOS_VERSION.
+        EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
+        STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
+        # DEFINED expects the ENV{...} form without a leading `$`: with `$`
+        # the value is expanded first and the test is then made on a variable
+        # named after that value, so an exported MACOSX_DEPLOYMENT_TARGET
+        # would never be detected.
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
@@ -49,6 +48,8 @@ ELSE(WIN32)
            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
                SET(HOST_SYSTEM "fedora")
            ENDIF()
+            STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
        ENDIF(EXISTS "/etc/issue")
        IF(EXISTS "/etc/redhat-release")
@@ -70,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
 MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
-MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 # configuration for cross-compiling
@@ -82,6 +83,9 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
        SET(RPI TRUE)
        INCLUDE(cross_compiling/raspberry_pi)
+    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+        SET(IOS TRUE)
+        INCLUDE(cross_compiling/ios)
    ENDIF()
 ENDIF()

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME)
            endif()
        endforeach()
        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            list(APPEND LIBS "-undefined dynamic_lookup")
+            if(NOT IOS_ENABLE_BITCODE)
+                list(APPEND LIBS "-undefined dynamic_lookup")
+            endif()
        endif()
        list(REVERSE libsInArgn)
        target_link_libraries(${TARGET_NAME}
@@ -71,30 +73,52 @@ function(link_paddle_exe TARGET_NAME)
        generate_rdma_links()
    endif()
-    target_circle_link_libraries(${TARGET_NAME}
+    if(MOBILE_INFERENCE)
-        ARCHIVE_START
+        target_circle_link_libraries(${TARGET_NAME}
-        paddle_gserver
+            ARCHIVE_START
-        paddle_function
+            paddle_gserver
-        ARCHIVE_END
+            paddle_function
-        paddle_pserver
+            ARCHIVE_END
-        paddle_trainer_lib
+            paddle_math
-        paddle_network
+            paddle_utils
-        paddle_math
+            paddle_parameter
-        paddle_utils
+            paddle_proto
-        paddle_parameter
+            paddle_cuda
-        paddle_proto
+            ${EXTERNAL_LIBS}
-        paddle_cuda
+            ${CMAKE_THREAD_LIBS_INIT}
-        paddle_optimizer
+            ${CMAKE_DL_LIBS}
-        ${EXTERNAL_LIBS}
+            ${RDMA_LD_FLAGS}
-        ${CMAKE_THREAD_LIBS_INIT}
+            ${RDMA_LIBS})
-        ${CMAKE_DL_LIBS}
+    else()
-        ${RDMA_LD_FLAGS}
+        target_circle_link_libraries(${TARGET_NAME}
-        ${RDMA_LIBS})
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_pserver
+            paddle_trainer_lib
+            paddle_network
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            paddle_optimizer
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    endif()
    if(ANDROID)
        target_link_libraries(${TARGET_NAME} log)
    endif(ANDROID)
+    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
+      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    endif()
    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()

--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
    trainer_config_helpers/optimizers.rst
    trainer_config_helpers/data_sources.rst
    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
    trainer_config_helpers/poolings.rst
    trainer_config_helpers/networks.rst
    trainer_config_helpers/evaluators.rst

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
    :noindex:
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept

--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,3 +125,8 @@ simple_attention
    :members: simple_attention
    :noindex:
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -3,7 +3,7 @@
 ## Ingredients
 As our design principle is starting from the essence: how could we
-allow users to express and solve their problems at neural networks.
+allow users to express and solve their problems as neural networks.
 Some essential concepts that our API have to provide include:
 1. A *topology* is an expression of *layers*.
@@ -233,7 +233,7 @@ paddle.dist_train(model,
                  num_parameter_servers=15)
 ```
-The pseudo code if `paddle.dist_train` is as follows:
+The pseudo code of `paddle.dist_train` is as follows:
 ```python
 def dist_train(topology, parameters, trainer, reader, ...):

--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
 ## Auto Gradient Checker Design
 ## Backgraound：
- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  - 1. you should get the right backpropagation formula according to the forward computation.
+  1. you should get the right backpropagation formula according to the forward computation.
-  - 2. you should implement it right in CPP.
+  2. you should implement it right in CPP.
-  - 3. it's difficult to prepare test data.
+  3. it's difficult to prepare test data.
- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
+- Auto gradient checking gets a numerical gradient from the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
-  - 1. numeric gradient checker only need forward operator.
+  1. the numerical gradient checker only needs the forward operator.
-  - 2. user only need to prepare the input data for forward Operator.
+  2. the user only needs to prepare the input data for the forward Operator.
 ## Mathematical Theory
-The following two document from stanford has a detailed explanation of how to get numeric gradient and why it's useful.
+The following two documents from Stanford have detailed explanations of how to get the numerical gradient and why it's useful.
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
@@ -20,7 +20,7 @@ The following two document from stanford has a detailed explanation of how to ge
 ## Numeric Gradient Implementation
 ### Python Interface
 ```python
-def get_numeric_gradient(op,
+def get_numerical_gradient(op,
                         input_values,
                         output_name,
                         input_to_check,
@@ -30,13 +30,13 @@ def get_numeric_gradient(op,
    Get Numeric Gradient for an operator's input.
    :param op: C++ operator instance, could be an network
-    :param input_values: The input variables. Should be an dictionary, key is
+    :param input_values: The input variables. Should be a dictionary, whose key is
-    variable name. Value is numpy array.
+    variable name, and value is numpy array.
    :param output_name: The final output variable name.
-    :param input_to_check: The input variable need to get gradient.
+    :param input_to_check: The input variable with respect to which to compute the gradient.
    :param delta: The perturbation value for numeric gradient method. The
    smaller delta is, the more accurate result will get. But if that delta is
-     too small, it could occur numerical stability problem.
+     too small, it will suffer from numerical stability problem.
    :param local_scope: The local scope used for get_numeric_gradient.
    :return: The gradient array in numpy format.
    """
@@ -45,28 +45,28 @@ def get_numeric_gradient(op,
 ### Explanation:
 - Why need `output_name`
-  - One Operator may have multiple Output, you can get independent gradient from each Output. So user should set one output to calculate.
+  - An Operator may have multiple Outputs; one can get an independent gradient from each Output. So the caller should specify the name of the output variable.
 - Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
 ### Core Algorithm Implementation
 ```python
-    # we only compute gradient of one element each time.
+    # we only compute the gradient of one element at a time.
-    # we use a for loop to compute the gradient of every element.
+    # we use a for loop to compute the gradient of each element.
    for i in xrange(tensor_size):
-        # get one input element throw it's index i.
+        # get one input element by its index i.
        origin = tensor_to_check.get_float_element(i)
-        # add delta to it, run op and then get the sum of the result tensor.
+        # add delta to it, run op and then get the new value of the result tensor.
        x_pos = origin + delta
        tensor_to_check.set_float_element(i, x_pos)
        y_pos = get_output()
-        # plus delta to this element, run op and get the sum of the result tensor.
+        # subtract delta from this element, run op and get the new value of the result tensor.
        x_neg = origin - delta
        tensor_to_check.set_float_element(i, x_neg)
        y_neg = get_output()
@@ -85,15 +85,15 @@ def get_numeric_gradient(op,
 Each Operator Kernel has three kinds of Gradient:
- 1. Numeric Gradient
+1. Numerical gradient
- 2. CPU Operator Gradient
+2. CPU kernel gradient
- 3. GPU Operator Gradient(if supported)
+3. GPU kernel gradient (if supported)
-Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as the reference value.
+The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
- 1. calculate the numeric gradient.
+1. calculate the numerical gradient
- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient.
+2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient.(if support GPU)
+3. calculate GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported)
 #### Python Interface
@@ -110,8 +110,8 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
        :param forward_op: used to create backward_op
        :param input_vars: numpy value of input variable. The following
            computation will use these variables.
-        :param inputs_to_check: inputs var names that should check gradient.
+        :param inputs_to_check: the input variable with respect to which to compute the gradient.
-        :param output_name: output name that used to
+        :param output_name: The final output variable name.
        :param max_relative_error: The relative tolerance parameter.
        :param no_grad_set: used when create backward ops
        :param only_cpu: only compute and check gradient on cpu kernel.
@@ -120,24 +120,24 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
 ```
 ### How to check if two numpy array is close enough?
-if `abs_numeric_grad` is nearly zero, then use abs error for numeric_grad, not relative
+if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
 ```python
-numeric_grad = ...
+numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
-abs_numeric_grad = numpy.abs(numeric_grad)
+abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
+# if abs_numerical_grad is nearly zero, then use abs error for numerical_grad, not relative
 # error.
-abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
-diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
+diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
 max_diff = numpy.max(diff_mat)
 ```
 #### Notes：
-1，The Input data for auto gradient checker should be reasonable to avoid numeric problem.
+The input data for the auto gradient checker should be reasonable to avoid numerical stability problems.
 #### Refs:

--- a/doc/design/block.md
+++ b/doc/design/block.md
+# Design Doc: Block and Scope
+## The Representation of Computation
+Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
+- Caffe, Torch, and Paddle: sequences of layers.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
+- PaddlePaddle: nested blocks, like C++ and Java programs.
+## Block in Programming Languages and Deep Learning
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
+Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
+| programming languages | PaddlePaddle          |
+|-----------------------|-----------------------|
+| for, while loop       | RNN, WhileOp          |
+| if, if-else, switch   | IfElseOp, SwitchOp    |
+| sequential execution  | a sequence of layers  |
+A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
+## Stack Frames and the Scope Hierarchy
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
+| programming languages | PaddlePaddle                    |
+|-----------------------|---------------------------------|
+| stack                 | scope hierarchy                 |
+| stack frame           | scope                           |
+| push at entering block| push at entering block          |
+| pop at leaving block  | destroy when minibatch completes|
+1. In traditional programs:
+   - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
+   - After the execution leaves the right curly brace, the runtime pops the frame.
+   - The maximum number of frames in the stack is the maximum depth of nested blocks.
+1. In PaddlePaddle
+   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - The height of the highest tree is the maximum depth of nested blocks.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
+## Use Blocks in C++ and PaddlePaddle Programs
+Let us consolidate the discussion by presenting some examples.
+### Blocks with `if-else` and `IfElseOp`
+The following C++ program shows how blocks are used with the `if-else` structure:
+```c++
+namespace pd = paddle;
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int z = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+```
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
+```python
+import paddle as pd
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
+### Blocks with `for` and `RNNOp`
+The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
+```python
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  h_prev = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, h_prev)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
+o1, o2 = rnn()
+```
+has its equivalent C++ program as follows
+```c++
+int* x = {10, 20, 30};
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};
+int mem[sizeof(x) / sizeof(x[0]) + 1];
+int o1[sizeof(x) / sizeof(x[0]) + 1];
+int o2[sizeof(x) / sizeof(x[0]) + 1];
+for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
+  int x = x[i-1];
+  if (i == 1) mem[0] = m;
+  int a = W * x;
+  int b = Y * mem[i-1];
+  int s = fc_out + hidden_out;
+  int act = sigmoid(sum);
+  mem[i] = act;
+  o1[i] = act;
+  o2[i] = hidden_out;
+}
+```
+## Compilation and Execution
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
+## The "Binary Executable File Format"
+The definition of the protobuf message is as follows:
+```protobuf
+message BlockDesc {
+  repeated VarDesc vars = 1;
+  repeated OpDesc ops = 2;
+}
+```
+The step net in the above RNN example would look like
+```
+BlockDesc {
+  vars = {
+    VarDesc {...} // x
+    VarDesc {...} // h
+    VarDesc {...} // fc_out
+    VarDesc {...} // hidden_out
+    VarDesc {...} // sum
+    VarDesc {...} // act
+  }
+  ops = {
+    OpDesc {...} // matmul
+    OpDesc {...} // add_two
+    OpDesc {...} // sigmoid
+  }
+};
+```
+Also, the RNN operator in the above example is serialized into a protobuf message of type `OpDesc` and would look like:
+```
+OpDesc {
+  inputs = {0} // the index of x in vars of BlockDesc above
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
+  attrs {
+    "states" : {1} // the index of h
+    "step_net" : <above step net>
+  }
+};
+```
+This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
+## The Compilation of Blocks
+During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
+VarDesc in a block should have its own name scope to prevent local variables from affecting the parent block's name scope.
+A child block's name scope should inherit the parent's so that an OpDesc in the child block can reference a VarDesc that is stored in the parent block. For example:
+```python
+a = pd.Variable(shape=[20, 20])
+b = pd.fc(a, params=["fc.w", "fc.b"])
+rnn = pd.create_rnn()
+with rnn.stepnet():
+    x = a.as_step_input()
+    # reuse fc's parameter
+    fc_without_b = pd.get_variable("fc.w")
+    rnn.output(fc_without_b)
+out = rnn()
+```
+The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
+To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
+`SymbolTable` can do the following:
+- store the definitions (some names and attributes) of variables and operators,
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+```c++
+// Information in SymbolTable is enough to trace the dependency graph. So maybe
+// the Eval() interface takes a SymbolTable is enough.
+class SymbolTable {
+ public:
+  SymbolTable(SymbolTable* parent) : parent_(parent) {}
+  OpDesc* NewOp(const string& name="");
+  // TODO determine whether name is generated by python or C++.
+  // Currently assume that a unique name will be generated by C++ if the
+  // argument name is left default.
+  VarDesc* Var(const string& name="");
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
+  // recursively.
+  // this interface is introduced to support InferShape, find protobuf messages
+  // of variables and operators, pass pointers into InferShape.
+  //
+  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
+  VarDesc* FindVar(const string& name, bool recursive=true);
+  OpDesc* FindOp(const string& name);
+  BlockDesc Compile() const;
+ private:
+  SymbolTable* parent_;
+  map<string, OpDesc> ops_;
+  map<string, VarDesc> vars_;
+};
+```
+After all the descriptions of variables and operators are added into SymbolTable,
+the block has enough information to run.
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
+```c++
+namespace {
+class Block : OperatorBase {
+public:
+  Block(const BlockDesc& desc) desc_(desc) {}
+  void InferShape(const framework::Scope& scope) const override {
+    if (!symbols_ready_) {
+      CreateVariables(scope);
+      CreateOperators();
+    }
+    // should run InferShape first.
+    for (auto& op : runtime_table_.ops()) {
+      op->InferShape(scope);
+    }
+  }
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
+    for (auto& op : runtime_table_.ops()) {
+      op->Run(scope, dev_ctx);
+    }
+  }
+  void CreateVariables(const framework::Scope& scope);
+  void CreateOperators();
+  // some other necessary interfaces of NetOp are listed below
+  // ...
+private:
+  BlockDesc desc_;
+  bool symbols_ready_{false};
+};
+```
+## The Execution of Blocks
+Block inherits from OperatorBase, which has a Run method.
+Block's Run method will run its operators sequentially.
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
+The definition of Eval is as follows:
+```c++
+// clean a block description by targets using the corresponding dependency graph.
+// return a new BlockDesc with minimal number of operators.
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
+// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
+void Block::Eval(const vector<string>& targets,
+                 const framework::Scope& scope,
+                 const platform::DeviceContext& dev_ctx) {
+  BlockDesc min_desc = Prune(desc_, targets);
+  Block min_block(min_desc);
+  min_block.Run(scope, dev_ctx);
+}
+```
--- a/doc/design/cluster_train/src/trainer.graffle
+++ b/doc/design/cluster_train/src/trainer.graffle
--- a/doc/design/dcgan.png
+++ b/doc/design/dcgan.png
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
+# Executor Design Doc
+## Motivation
+We use executor to do the runtime evaluation of a `ProgramDesc`.
+## Overview
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
+### What does executor do?
+It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+### What does executor NOT do?
+It does not do runtime optimization, meaning it does not intelligently parse the dependencies of each op and choose which ops should be run and in which order.
+It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
+## Implementation
+`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
@@ -53,12 +53,12 @@ Let's explain using an example.  Suppose that we are going to compose the FC usi
 ```python
 def operator.mul(X1, X2):
    O = Var()
-    paddle.cpp.create_operator("mul", input={X1, Y1], output=O)
+    paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
    return O
 def operator.add(X1, X2):
    O = Var()
-    paddle.cpp.create_operator("add", input={X1, X2], output=O)
+    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
    return O
 ```

--- a/doc/design/gan_api.md
+++ b/doc/design/gan_api.md
+# Design for GAN
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+<p align="center">
+<img src="./test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+<p align="center">
+<img src="./dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+## The Conditional-GAN might be a class. 
+In this design, we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+- DCGAN(object): which contains everything required to build a GAN model. It provides the following member functions as its API:
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one. 
+Returns a 0/1 binary label.
+- build_model(self):
+Build the whole GAN model, and define the training loss for both generator and discriminator.
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can't be trained correctly)
+- Different optimizers responsible for optimizing different loss.
+To be more detailed, we introduce our design of DCGAN as following:
+### Class member Function: Initializer
+- Set up hyper-parameters, including conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+    # hyper parameters  
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+    if not self.y_dim:
+      z = pd.layer.concat(1, [z, y])
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
+    G_im = pd.layer.tanh(G_im)
+    return G_im
+```
+### Class member function: Discriminator
+- Given an image (either generated or real), returns a binary logit indicating whether it is real or fake.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(h0)
+    D_h0_relu = pd.layer.lrelu(h0_bn)
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively. 
+If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.images)
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+```
+If we do not have dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusion and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice if it is shared by the graph.
+- Requires ability to create a block anytime, rather than in if-else or rnn only;
+## Main function for the demo:
+Generally, the user of GAN just needs to do the following things:
+- Define an object as DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+from paddle.v2 as pd
+import numpy as np
+import logging
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+    # load mnist data
+    data_X, data_y = self.load_mnist()
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block.g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
+    # executor
+    sess = pd.executor()
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+        if batch_id % 2 == 0:
+          sess.run(d_step, 
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+# More thinking about dependency engine v.s. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
--- a/doc/design/graph.md
+++ b/doc/design/graph.md
@@ -56,7 +56,7 @@ For each parameter, like W and b created by `layer.fc`, marked as double circles
 ## Block and Graph
-The word block and graph are interchangable in the desgin of PaddlePaddle.  A [Block[(https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
+The words block and graph are interchangeable in the design of PaddlePaddle.  A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
 A Block keeps operators in an array `BlockDesc::ops`
@@ -67,4 +67,4 @@ message BlockDesc {
 }
 ```
-in the order that there appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
+in the order that they appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
+# The `IfElse` Operator
-```python
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
-import paddle as pd
-x = var()
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
-y = var()
+- the PaddlePaddle version takes a vector of boolean values as the condition, so that instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
-cond = var()
-b = pd.create_ifop(inputs=[x], output_num=1)
+## Example
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
+The following PaddlePaddle program shows the usage of the IfElse operator:
-```
-If we want the output still has N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
 ```python
 import paddle as pd
-x = var()
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var()
+y = var(1) # shape=[1], value=1
-cond = var()
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-default_value = var()
+cond = larger_than(x, 15) # [false, true, true]
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
+ie = pd.ifelse()
-    x = b.inputs(0)
+with ie.true_block():
-    z = operator.add(x, y)
+    d = pd.layer.add(x, y)
-    b.set_output(0, operator.softmax(z))
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
-with b.false_block():
+    d = pd.layer.fc(z)
-    x = b.inputs(0)
+    ie.output(d, d+1)
-    z = layer.fc(x)
+o1, o2 = ie(cond)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
 ```
-If only true_block is set in an IfElseOp, we can have a default value for false as:
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
-```python
-import paddle as pd
+An equivalent C++ program is as follows:
-x = var()
+```c++
-y = var()
+namespace pd = paddle;
-cond = var()
-default_value = var()
+int x = 10;
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
+int y = 1;
+int z = 10;
-with b.true_block():
+bool cond = false;
-    x = b.inputs(0)
+int o1, o2;
-    z = operator.add(x, y)
+if (cond) {
-    b.set_output(0, operator.softmax(z))
+  int d = x + y;
+  o1 = z;
-out = b(cond)
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
--- a/doc/design/images/feed_forward.png
+++ b/doc/design/images/feed_forward.png
--- a/doc/design/images/feed_forward_regularized.png
+++ b/doc/design/images/feed_forward_regularized.png
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@@ -33,7 +33,6 @@ digraph ImageClassificationGraph {
        cost -> MSE_Grad [color=red];
        d_cost -> MSE_Grad [color=red];
-        x -> MSE_Grad [color=red];
        l -> MSE_Grad [color=red];
        y -> MSE_Grad -> d_y [color=red];

--- a/doc/design/images/graph_construction_example_all.png
+++ b/doc/design/images/graph_construction_example_all.png
--- a/doc/design/images/graph_construction_example_forward_backward.png
+++ b/doc/design/images/graph_construction_example_forward_backward.png
--- a/doc/design/images/graph_construction_example_forward_only.png
+++ b/doc/design/images/graph_construction_example_forward_only.png
--- a/doc/design/images/l1_regularization.png
+++ b/doc/design/images/l1_regularization.png
--- a/doc/design/images/l2_regularization.png
+++ b/doc/design/images/l2_regularization.png
--- a/doc/design/images/loss_equation.png
+++ b/doc/design/images/loss_equation.png
--- a/doc/design/infer_var_type.md
+++ b/doc/design/infer_var_type.md
+# Design Doc: InferVarType
+## The Problem Posed
+A variable in our design can hold different types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time.
+## Proposed Solution
+The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+It takes an operator description as its input, infers the types of the output variables, and stores them in the block description.
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`.
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+## Register InferVarType
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+}
+```
+Operator developers can write a specialized `VarTypeInferer` as follows.
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+}
+```
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
--- a/doc/design/ops/images/2_level_rnn.dot
+++ b/doc/design/ops/images/2_level_rnn.dot
+digraph G {
+  rnn [label="1-th level RNN" shape=box]
+  subgraph cluster0 {
+    label = "time step 0"
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+    rnn1 [label="2-th level RNN" shape=box]
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+  subgraph cluster1 {
+    label = "time step 1"
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+    rnn2 [label="2-th level RNN" shape=box]
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+  subgraph cluster2 {
+    label = "time step 2"
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+    rnn3 [label="2-th level RNN" shape=box]
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
--- a/doc/design/ops/images/2_level_rnn.png
+++ b/doc/design/ops/images/2_level_rnn.png
--- a/doc/design/ops/images/rnn.dot
+++ b/doc/design/ops/images/rnn.dot
+digraph G {
+  label = "simple RNN implementation" 
+  ranksep=2;
+  //graph [nodesep=1, ranksep=1];
+  node[nodesep=1]
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+}
--- a/doc/design/ops/images/rnn.jpg
+++ b/doc/design/ops/images/rnn.jpg
--- a/doc/design/ops/images/rnn.png
+++ b/doc/design/ops/images/rnn.png
--- a/doc/design/ops/images/rnn_2level_data.dot
+++ b/doc/design/ops/images/rnn_2level_data.dot
+digraph G {
+  chapter [label="chapter"]
+  subgraph cluster0 {
+    label = "paragraph 0"
+    top_rnn0[label="top rnn step 0" shape=box]
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+  subgraph cluster1{
+    label = "paragraph 1"
+    top_rnn1[label="top rnn step 1" shape=box]
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+  subgraph cluster_p0 {
+    label = "sentence 0"
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+}
--- a/doc/design/ops/images/rnn_2level_data.png
+++ b/doc/design/ops/images/rnn_2level_data.png
--- a/doc/design/ops/rnn.md
+++ b/doc/design/ops/rnn.md
+# RNNOp design
+This document is about an RNN operator which requires that instances in a mini-batch have the same length.  We will have a more flexible RNN operator.
+## RNN Algorithm Implementation
+<p align="center">
+<img src="./images/rnn.jpg"/>
+</p>
+The above diagram shows an RNN unrolled into a full network.
+There are several important concepts:
+- *step-net*: the sub-graph to run at each step,
+- *memory*, $h_t$, the state of the current step,
+- *ex-memory*, $h_{t-1}$, the state of the previous step,
+- *initial memory value*, the ex-memory of the first step.
+### Step-scope
+There could be local variables defined in step-nets.  PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step.
+<p align="center">
+<img src="./images/rnn.png"/><br/>
+Figure 2 the RNN's data flow
+</p>
+Please be aware that all steps run the same step-net.  Each step
+1. creates the step-scope,
+2. realizes local variables, including step-outputs, in the step-scope, and
+3. runs the step-net, which could use these variables.
+The RNN operator will compose its output from step outputs in step scopes.
+### Memory and Ex-memory
+Let's give more details about memory and ex-memory via a simple example:
+$$
+h_t = U h_{t-1} + W x_t
+$$,
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$, respectively.
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the value of the previous step's memory variable to the current ex-memory variable.
+### Usage in Python
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+We can define an RNN's step-net using Block:
+```python
+import paddle as pd
+X = some_op() # x is some operator's output, and is a LoDTensor
+a = some_op()
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+    x = rnn.add_input(X)
+    # declare a memory (rnn's step)
+    h = rnn.add_memory(init=a)
+    # h.pre_state() means previous memory of rnn
+    new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
+    # update current memory
+    h.update(new_state)
+    # indicate that h variables in all step scopes should be merged
+    rnn.add_outputs(h)
+out = rnn()
+```
+Python API functions in the above example:
+- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory` creates a variable used as the memory.
+- `rnn.add_outputs` marks the variables that will be concatenated across steps into the RNN output.
+### Nested RNN and LoDTensor
+An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.
+The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text.
+<p align="center">
+<img src="./images/2_level_rnn.png"/>
+</p>
+```python
+import paddle as pd
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+# a is output of some op
+a = some_op()
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+def lower_level_rnn(paragraph):
+    '''
+    x: the input
+    '''
+    rnn = pd.create_rnn_op(output_num=1)
+    with rnn.stepnet():
+        sentence = rnn.add_input(paragraph, level=0)
+        h = rnn.add_memory(shape=[20, 30])
+        h.update(
+            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+        # get the last state as sentence's info
+        rnn.add_outputs(h)
+    return rnn
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = rnn.add_input(chapter_data, level=1)
+    low_rnn = lower_level_rnn(paragraph_data)
+    paragraph_out = low_rnn()
+    h = rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
+    top_level_rnn.add_outputs(h)
+# just output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`.  The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+By default, the `RNNOp` will concatenate the outputs from all the time steps;
+if `output_all_steps` is set to False, it will only output the final time step.
+<p align="center">
+<img src="images/rnn_2level_data.png"/>
+</p>
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
+## Optimizer Design
+### The Problem
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+This work relies on three kinds of operators:
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+### High-level Python API to describe the training process
+1. Users write code to describe the network:
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+2. Users create a certain kind of Optimizer with some arguments.
+	```python
+	optimizer = AdagradOptimizer(learing_rate=0.001)
+	```
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+#### Optimizer Python interface:
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+    """
+    def __init__(self):
+        pass
+    def create_backward_pass(self, loss, parameter_list=None):
+        """
+        create and add gradient Operators in BlockDesc to Compute gradients of `loss`
+        for parameters in parameter_list
+        Args:
+          loss: an variable generated by cost function.
+          parameter_list: parameters that need to compute gradient and update to optimize the lost.
+        Returns:
+          list of (parameters, gradients) pair.
+        """
+        return None
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+        Returns:
+          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+        """
+        return None
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+        This method combines interface `create_backward_pass()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+```
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
--- a/doc/design/parameters_in_cpp.md
+++ b/doc/design/parameters_in_cpp.md
 # Design Doc: The C++ Class `Parameters`
-`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
+`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameters between topologies. We described usages of `Parameter` in [api.md](./api.md).
-We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation:
 * We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
-* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
+* We did not support sharing Parameters while training. We just trigger `memcpy` when training starts.
-It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
+It is necessary that we implement Parameters in CPP side. However, it could result in a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In the current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
 1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
 It is evident that we should use `paddle::Parameter` when developing `Parameters`.
 However, the `Parameter` class contains many functions and does not have a clear interface.
 It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
 When we developing `Parameters`, we only use `create/store Parameter` functionality.
-We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation.
 2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
 We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
@@ -24,7 +24,7 @@ Also, we should handle multi-GPU/CPU training, because `forward` and `backward`
 So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
-The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+The step by step approach for implementing Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one.
 1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.

--- a/doc/design/program.md
+++ b/doc/design/program.md
+# Design Doc: PaddlePaddle Programs
+## Compile and Execution
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
+## Programs and Blocks
+The basic structure of a PaddlePaddle program is some nested blocks, like a C++ or Java program.
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
+}
+```
+The following PaddlePaddle program has three blocks:
+```python
+import paddle as pd  // block 0
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
+with ie.true_block():  // block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  // block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+## `BlockDesc` and `ProgramDesc`
+All protobuf messages are defined in `framework.proto`.
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+```protobuf
+message BlockDesc {
+  required int32 parent = 1;
+  repeated VarDesc vars = 2;
+  repeated OpDesc ops = 3;
+}
+```
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+### Global Block
+The global block is the first one in the above array.
+## Operators that Use Blocks
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+The definition of `OpDesc` shows that an operator could have some attributes:
+```protobuf
+message OpDesc {
+  AttrDesc attrs = 1;
+  ...
+}
+```
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+```
+message AttrDesc {
+  required string name = 1;
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```
+## InferShape
+With this design, the InferShape function should take the following parameters:
+```c++
+void InferShape(int current_block,
+                int current_operator,
+                ProgramDesc* program // might change VarDesc values.
+                ) {
+  ...
+}
+```
+where
+- `current_block` indexes into `ProgramDesc::blocks`,
+- `current_operator` indexes into `BlockDesc::ops`.
--- a/doc/design/prune.md
+++ b/doc/design/prune.md
+# Prune
+## Motivation
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement a
+`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
+and generates a pruned `ProgramDesc`.
+## Challenge
+Pruning needs to support both variables and operators being evaluation targets. Consider the following
+different situations.
+```python
+# Case 1: run foward pass.
+cost_np = session.run(target=cost)
+# Case 2: run backward passing.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+## Solution
+To support evaluation of operators, we add an `is_target` field to `OpDesc`.
+```c++
+message OpDesc {
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
+```
+To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being
+`fetch_op`'s input. Then we also mark `fetch_op` as a target.
+### Algorithm
+If an operator needs to be run, it must fall into one of the following cases:
+1. It is the target.
+2. Some other op depends on it, meaning its output is some other op's input.
+The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+```
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
+# Design Doc: Python API
+Due to the refactoring of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+## Core Concepts
+### Program
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+```python
+class Program(objects):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+    def global_block():
+        return self.blocks[0]
+    def current_block():
+        return self.get_block(self.current_block)
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+### Block
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+```python
+class Block(object):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+    def create_var(self, ...):
+        return Variable(self, ...)
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+### Operator
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<string, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+    def type(self):
+        return self.desc.type()
+```
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+### Variable
+Operators take Variables as their inputs and outputs.
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a `Variable` instance.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+### Parameter
+A parameter is a global variable with an initializer (or load) operator.
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+When users create a parameter, they can call
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator:
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+## Layer Function
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s).
+For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+### Necessity for reusing code between layer functions
+There is a lot of code that can be reused, such as:
+* Give the default value of configuration, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for bias is to fill with zero.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+### Comparison between global functions and helper class
+The `FullyConnected` layer will be as follows when we provide global functions:
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+  mul_results.append(tmp)
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+We can provide many helper functions for layer developers. However, there are several disadvantages for global helper functions:
+1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
+2. Global functions force layer developers to pass the same parameters again and again.
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follows.
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+We not only use fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what functions they can invoke by typing `helper.` in a Python editor.
+### Implementation of layer helper
+We just keep all parameters of a layer function as a dictionary in the layer helper as a private data member. Every method of the layer helper will look up the dictionary after it is invoked. In that way, we can implement one layer helper for all layer functions, even if some layers do not contain some operators. For example, the `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is:
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+    tmp = self.create_tmp_var(self)
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+## Optimizer
+[Optimizer Design Doc](./optimizer.md)
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -52,7 +52,7 @@ Here are valid outputs:
 # a mini batch of three data items, each data item is a list (single column).
 [([1,1,1],),
 ([2,2,2],),
-([3,3,3],),
+([3,3,3],)]
 ```
 Please note that each item inside the list must be a tuple, below is an invalid output:

--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
+# Design Doc: Distributed Training Architecture
+## Abstract
+PaddlePaddle v0.10.0 uses the "trainer-parameter server"
+architecture. We run multiple replicated instances of trainers (runs
+the same code written by the user) and parameter servers for
+distributed training. This architecture served us well, but has some
+limitations:
+1. Need to write special code to handle tasks which should only be run
+  by a single trainer. E.g., initializing model and saving model.
+2. Model parallelism is hard: need to write if-else branches conditioned
+  on the trainer ID to partition model onto each trainer, and manually
+  write the inter-model-shard communication code.
+3. The user can not directly specify the parameter update rule: need
+   to modify the parameter server C++ code and compile a new
+   binary. This adds complication for researchers: A lot of extra
+   effort is required. Besides, the training job submission program
+   may not allow running arbitrary binaries.
+This design doc discusses PaddlePaddle's new distributed training
+architecture that addresses the above limitations.
+## Analysis
+We will assume the user writes the trainer program in Python; the same
+analysis holds if the trainer program is written in C++.
+### Limitation 1
+If we look at the Python code that the user writes, there are two
+kinds of functionalities:
+- The training logic such as load / save model and print log.
+- The neural network definition such as the definition of the data
+  layer, the fully connected layer, the cost function and the
+  optimizer.
+When we train with PaddlePaddle v0.10.0 in a distributed manner, multiple
+replicated Python instances are running on different nodes: both the
+training logic and the neural network computation are replicated.
+The tasks that should only run once all belong to the training logic,
+if we only replicate the neural network computation, but do **not**
+replicate the training logic, the limitation could be solved.
+### Limitation 2
+Model parallelism means running a single model on multiple nodes by
+partitioning the model onto different nodes and managing the
+inter-model-shard communications.
+PaddlePaddle should be able to modify the neural network computation
+definition to support model parallelism automatically. However, the
+computation is only specified in Python code, and PaddlePaddle can not
+modify Python code.
+Just like a compiler uses an intermediate representation (IR) so that
+programmers do not need to manually optimize their code in most
+cases - the compiler will optimize the IR:
+<img src="src/compiler.png"/>
+We can have our own IR too: PaddlePaddle can support model parallelism by
+converting the IR so the user no longer needs to manually do it in
+Python:
+<img src="src/paddle-compile.png"/>
+The IR for PaddlePaddle after refactor is called `Block`, it specifies
+the computation dependency graph and the variables used in the
+computation.
+### Limitation 3
+The user can not directly specify the parameter update rule for the
+parameter server because the parameter server does not use the same
+computation definition as the trainer. Instead, the update rule is
+baked in the parameter server. The user can not specify the update
+rule in the same way of specifying the trainer computation.
+This could be fixed by making the parameter server run the same
+computation definition as the trainer. For a detailed explanation,
+please
+see
+[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+## Distributed Training Architecture
+The new distributed training architecture can address the above
+limitations. Below is the illustration:
+<img src="src/distributed_architecture.png"/>
+The architecture includes major components: *PaddlePaddle Python*,
+*PaddlePaddle converter* and *PaddlePaddle runtime*:
+### PaddlePaddle Python
+PaddlePaddle Python is the Python library that the user's Python trainer
+invokes to build the neural network topology, start training, etc.
+```Python
+paddle.init()
+input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
+img, label = input[0], input[1]
+hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
+prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
+cost = paddle.layer.classification_cost(input=prediction, label=label)
+optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
+session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
+for i in range(1000):
+	_, cost_val = session.eval(targets=[cost, optimizer])
+	print cost_val
+```
+The code above is a typical Python trainer code, the neural network
+topology is built using helper functions such as
+`paddle.layer.fc`. The training is done by calling `session.eval`
+iteratively.
+#### session.eval
+As shown in the graph, `session.eval` sends the IR and the evaluation
+inputs/targets to the PaddlePaddle cluster for evaluation. The
+targets can be any variable in the computation graph. When the target
+is the `optimizer` variable, the neural network will be optimized
+once. When the target is the `cost` variable, `session.eval` returns
+the cost value.
+The Python `session` is a wrapper of the C++ `Session` class. For more
+information about `Session`, please
+see [Design Doc: Session](./session.md).
+### PaddlePaddle Converter
+PaddlePaddle converter automatically converts the IR in the request
+(IR and evaluation inputs/targets) from PaddlePaddle Python to new
+partitioned IRs and dispatch the new IRs and evaluation inputs/targets
+to different PaddlePaddle runtimes. Below are the steps:
+1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
+   fetches the eval targets to the IR.
+1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
+   the boundary. The runtime does not need to run the OPs that the
+   `fetch` OP does not depend on.
+1. Optimize the computation graph.
+1. Place the OPs in the graph onto different devices on different
+   PaddlePaddle runtime according to a placement algorithm and device
+   constraint specified by the user.
+1. Partition the graph according to runtime boundaries and add `send` /
+   `recv` OP pair on the runtime boundaries.
+1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+1. PaddlePaddle runtimes with the `fetch` OP report evaluation
+   results back to the converter, and the converter reports the evaluation
+   results back to the PaddlePaddle Python.
+The output IRs will be cached to optimize the conversion latency.
+#### Placement Algorithm
+Our first implementation will only support "trainer-parameter server"
+placement: the parameters, initializers, and optimizers are placed on
+the PaddlePaddle runtimes with the parameter server role. And
+everything else will be placed on the PaddlePaddle runtimes with the
+trainer role. This has the same functionality of our
+"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
+is more general and flexible.
+In the future, we will implement the general placement algorithm,
+which makes placements according to the input IR, and a model of
+device computation time and device communication time. Model
+parallelism requires the general placement algorithm.
+### PaddlePaddle Runtime
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
+runs the IR. The runtime does not need to do OP placement since it's
+already done by the converter.
+### Local Training Architecture
+The local training architecture will be the same as the distributed
+training architecture, the differences are everything runs locally,
+and there is just one PaddlePaddle runtime:
+<img src="src/local_architecture.png"/>
+### Training Data
+In PaddlePaddle v0.10.0, training data is typically read
+with [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training distributedly since the Python
+process no longer runs on the same node with the trainer processes,
+the Python reader will need to read from the distributed filesystem
+(assuming it has the access) and send to the trainers, doubling the
+network traffic.
+When doing distributed training, the user can still use Python data
+reader: the training data are sent with `session.eval`. However, this should
+be used for debugging purposes only. The users are encouraged to use
+the read data OPs.
+## References:
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
--- a/doc/design/ops/dist_train.md
+++ b/doc/design/ops/dist_train.md
--- a/doc/design/refactor/session.md
+++ b/doc/design/refactor/session.md
+# Design Doc: Session
+## Abstract
+The *session* object encapsulates the environment in which the
+computation graph is executed.
+We will have the *local* session and *remote* session, they offer the
+same [interface](#interface). The local session encapsulates the local
+runtime environment and the remote session encapsulates the cluster
+runtime environment.
+The local runtime environment contains:
+1. computation devices (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+The remote runtime environment contains:
+1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
+   and
+1. the distributed [scope](../scope.md) in a cluster which holds all
+   variables.
+The user can create a remote session on Paddle Cloud and evaluate the
+computation graph with it. In this way, the user can control the
+remote computation resource in a cluster from his local computer.
+## Background
+The current design has an implicit global session in which
+`paddle.eval()` is executed. The pain point is:
+Since the user is not able to explicitly switch between runtime
+environments, the user cannot run a topology in two independent
+environments.
+For example, in reinforcement learning, the user may want to have a
+stale model for inference and a fresh model for training, and only
+replace the stale model with the fresh model periodically.
+Furthermore, we have no concept that encapsulates a remote environment
+that executes a computation graph.
+We need the session object to address the above issues.
+## Session
+A session is an object that owns the runtime environment. All
+computations are executed through `session.eval()`.
+### Interface
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+Evaluates the target Operations or Variables in `targets`.
+- *targets*: the evaluation targets. Can be a single Operation or
+  Variable, or a list with the Operations or Variables as
+  elements. The value returned by `eval()` has the same shape as the
+  `targets` argument.
+  The PaddlePaddle program is represented by
+  the [ProgramDesc](../program.md), `eval()` will infer the
+  ProgramDesc from the given targets and run the PaddlePaddle
+  program. Please
+  see
+  [this graph](./distributed_architecture.md#local-training-architecture) for
+  the detailed illustration for the local session
+  and
+  [this graph](./distributed_architecture.md#distributed-training-architecture) for
+  the detailed illustration for the remote session.
+- *feed_dict*: a dictionary that contains the tensors which override
+  the edges of the computation graph.
+  feed_dict not only can provide the input data, it can override any
+  OP's input as well:
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+```python
+close()
+```
+Closes the session and releases the scope that the session owns.
+### Create a Local Session
+```python
+session(
+    devices=None
+)
+```
+Creates a new session. One session owns one global scope, so creating
+multiple sessions will create different scopes.
+- *devices*: a single `string` or a list of `string` of device names,
+  the corresponding devices will be the computation devices for
+  `eval()`. If not specified, all available devices (e.g., all GPUs)
+  will be used. The user doesn't need to specify the CPU device since
+  it will be always used. Multiple sessions can use the same device.
+#### Example
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+### Create a Remote Session
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+Creates a Paddle Cloud job. Fails if the job name exists.
+```python
+get_cloud_job(
+    name
+)
+```
+Gets a Paddle Cloud job.
+```python
+remote_session(
+    job
+)
+```
+- *job*: the Paddle Cloud job.
+#### Example
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_session(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
--- a/doc/design/refactor/src/compiler.graffle
+++ b/doc/design/refactor/src/compiler.graffle
--- a/doc/design/refactor/src/compiler.png
+++ b/doc/design/refactor/src/compiler.png
--- a/doc/design/ops/src/dist-graph.graffle
+++ b/doc/design/ops/src/dist-graph.graffle
--- a/doc/design/ops/src/dist-graph.png
+++ b/doc/design/ops/src/dist-graph.png
--- a/doc/design/refactor/src/distributed_architecture.graffle
+++ b/doc/design/refactor/src/distributed_architecture.graffle
--- a/doc/design/refactor/src/distributed_architecture.png
+++ b/doc/design/refactor/src/distributed_architecture.png
--- a/doc/design/ops/src/local-graph.graffle
+++ b/doc/design/ops/src/local-graph.graffle
--- a/doc/design/ops/src/local-graph.png
+++ b/doc/design/ops/src/local-graph.png
--- a/doc/design/refactor/src/local_architecture.graffle
+++ b/doc/design/refactor/src/local_architecture.graffle
--- a/doc/design/refactor/src/local_architecture.png
+++ b/doc/design/refactor/src/local_architecture.png
--- a/doc/design/refactor/src/paddle-compile.graffle
+++ b/doc/design/refactor/src/paddle-compile.graffle
--- a/doc/design/refactor/src/paddle-compile.png
+++ b/doc/design/refactor/src/paddle-compile.png
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
+# Design Doc: Refactorization Overview
+The goals of refactoring include:
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
+## Computation Graphs
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+1. A graph is composed of *variables* and *operators*.
+1. The description of graphs must be serializable/deserializable, so that:
+   1. It can be sent to the cloud for distributed execution, and
+   1. It can be sent to clients for mobile or enterprise deployment.
+1. The Python program does two things
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+## Description and Realization of Computation Graph
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+At runtime, the C++ program realizes the graph and runs it.
+| | Representation (protobuf messages) | Realization (C++ class objects) |
+|---|---|---|
+|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
+|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
+|Block|BlockDesc|Block|
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
+## Compilation and Execution
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+   1. Create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+   1. Run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Operator>* targets)` for optimization.
+## Intermediate Representation (IR)
+```text
+Compile Time -> IR -> Runtime
+```
+### Benefits of IR
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Automatically send partitioned IR to different nodes.
+  - Automatic Data Parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic Model Parallelism (planned for future)
+---
+# Operator/OpWithKernel/OpKernel
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+---
+# Operator
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+* `Operator` is the fundamental building block of the user interface.
+    * Operator stores input/output variable names and attributes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+    * Use `Run` to compute the `output` variables from the `input` variables.
+---
+# OpWithKernel/Kernel
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+    * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+    * `OpKernelKey` is the map key. Only device place now, but may be data type later.
+---
+# Why separate Kernel and Operator
+* Separate GPU and CPU code.
+    * Make Paddle capable of running without GPU.
+* Make one operator (which is a user interface) and create many implementations.
+    * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
+---
+# Libraries for Kernel development
+* `Eigen::Tensor` contains basic math and element-wise functions.
+    * Note that `Eigen::Tensor` has broadcast implementation.
+    * Limit the number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-writing `GPUKernel` and `CPU` code
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
+---
+# Operator Registration
+## Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+## How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
+---
+# The Registry Map
+### `OpInfoMap`
+`op_type(string)` -> `OpInfo`
+`OpInfo`:
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+---
+# Related Concepts
+### Op_Maker
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+---
+# Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+	1. Call maker class to complete `proto` and `checker`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
+---
+# Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+---
+# Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+	- Shared Variables => insert an `Add` operator to combine gradients
+	- No Gradient => insert a `fill_zero_grad` operator
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
+---
+# Scope, Variable, Tensor
+* `Tensor` is an n-dimension array with type.
+	* Only dims and data pointers are stored in `Tensor`.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+	* `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string `var name`, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+---
+# Block (in design)
+## the difference between original RNNOp and Block
+- `Block` as an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/ runtime separation design paradigm.
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
+---
+# Milestone
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - Determine some timelines,
+  - Frequently used Ops need to be migrated first,
+  - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
+---
+# Control the migration quality
+- Compare the performance of migrated models with old ones.
+- Follow the google C++ style guide.
+- Build the automatic workflow of generating Python/C++ documentations.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentations, read and improve them from a user's perspective.
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
+# Design Doc: Gradient Operators Registration
+## The Problem Posed
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+However, we noticed two problems with the current design:
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+## The Current Implementation
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows:
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+map<string, OpInfo> OpInfoMap;
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+## Proposed Solution
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for  the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+The `grad_op_maker_` is a `nullptr` if the operator does not have any associated gradient operators.
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
+The user interface should be
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
--- a/doc/design/regularization.md
+++ b/doc/design/regularization.md
+# Regularization in PaddlePaddle
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. Many strategies are used by machine learning practitioners to reduce the test error, possibly at the expense of increased training error. These strategies are collectively known as **regularization**. 
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+<img src="./images/loss_equation.png" align="center"/><br/>
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+##### L2 Regularization:
+<img src="./images/l2_regularization.png" align="center"/><br/>
+##### L1 Regularization
+<img src="./images/l1_regularization.png" align="center"/><br/>
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+## How to do Regularization in PaddlePaddle
+On surveying existing frameworks like Tensorflow, PyTorch, Caffe, etc, it can be seen that there are 2 common approaches of doing regularization:
+1. Making regularization a part of the optimizer using an attribute like `weight_decay` that is used to control the scale of the L2 Penalty. This approach is used in PyTorch as follows:
+	```python
+	opt =  torch.optim.SGD(params, lr=0.2, weight_decay=0.2)
+	```
+    At every optimization step, this code will add the gradient of the L2 Norm of the params to the gradient of the params with respect to the loss function. This can be seen in the following code snippet:
+    ```python
+    if weight_decay != 0:
+    	d_p.add_(weight_decay, p.data)
+    ```
+    This is a very restrictive way of doing regularization and does not give the users enough flexibility.
+    **Advantages**:
+    -  It is easy to implement for us.
+    -  Faster execution of backward. However, it can be done manually by advanced users too.
+	**Disadvantages**:
+    - Not flexible for other regularizations such as L1/L0 regularization.
+    - Does not allow for different regularization coefficients for different parameters. For example, in most models, only the weight matrices are regularized and the bias vectors are unregularized.
+    - Tightly coupled optimizer and regularization implementation. 
+2. Adding regularization ops to the graph through Python API. This approach is used by Tensorflow and Caffe. Using this approach, we manually add regularization ops to the graph and then add the regularization loss to the final loss function before sending them to the optimizer.
+	**Advantages**:
+    - Allows for greater flexibility to the users of Paddle. Using this approach, the users can put different regularization to different parameters and also choose parameters that are not a part of regularization.
+    - Makes it easy for the users to customize and extend the framework. 
+	**Disadvantages**:
+    - Implementation requires comprehensive design and time. 
+## Proposal for Regularization in PaddlePaddle
+### Low-Level implementation
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/activation_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+### Computation Graph
+Below is an example of a really simple feed forward neural network.
+<img src="./images/feed_forward.png" align="center"/><br/>
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+### Python API implementation for Regularization
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph. 
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+The proposal is to add these ops in a lazy manner just before the backward pass. 
+#### Storage of Regularization attributes
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+#### High-level API
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
-# Paddle发行规范
+# PaddlePaddle发行规范
-Paddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。
+PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
-Paddle每次发新的版本，遵循以下流程:
+PaddlePaddle每次发新的版本，遵循以下流程:
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
 2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
@@ -27,14 +27,14 @@ Paddle每次发新的版本，遵循以下流程:
 需要注意的是:
-* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试Paddle的行为。
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
-# Paddle 分支规范
+# PaddlePaddle 分支规范
-Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
-* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
 	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
 	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
 	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
@@ -42,18 +42,18 @@ Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branch
 * 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
 	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
 	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
-	* 当功能分支开发完毕后，向Paddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
+	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Request`，进而进行代码评审。
 		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
-# Paddle回归测试列表
+# PaddlePaddle回归测试列表
-本列表说明Paddle发版之前需要测试的功能点。
+本列表说明PaddlePaddle发版之前需要测试的功能点。
-## Paddle Book中所有章节
+## PaddlePaddle Book中所有章节
-Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 | | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |

--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -17,7 +17,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 1. Scope only contains a map of a name to variable.
-   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state(momentum) etc.
+   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
 1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear.
@@ -32,12 +32,12 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. 
-   Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be a invalid pointer when associated `Scope` is destroyed.
+   Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
 ```cpp
 class Scope {
 public:
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
  const Variable* FindVar(const std::string& name) const;
 private:
@@ -50,7 +50,7 @@ class Scope {
 Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
-1.  We can create local variables in a local scope. When that local scope are destroyed, all local variables should also be destroyed.
+1.  We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
 2.  Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent.
 ```cpp
@@ -98,7 +98,7 @@ class Scope {
  Variable* FindVar(const std::string& name) const;
  // return if already contains same name variable.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
 private:
  std::shared_ptr<Scope> parent_;
@@ -107,7 +107,7 @@ class Scope {
 ```
 ## Only scope can create a variable
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
 ## When scope destroyed, all variables inside this scope should be destroyed together
@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
 ## Orthogonal interface
-`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
--- a/doc/design/selected_rows.md
+++ b/doc/design/selected_rows.md
+# Design Doc: Selected Rows
+`SelectedRows` is a sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure:
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+The field `height_` is the first dimension of `SelectedRows`. The `rows_` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be:
+```
+x = SelectedRows {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+## SelectedRows in Protobuf
+`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. 
+So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description.
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+## InferShape for Selected Rows
+Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor.
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like the following:
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+## Sparse Operators
+There are several operators that need to be written to support `SelectedRows`. These are:
+1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
+2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for either a `dense` tensor or `SelectedRows`.
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@@ -6,9 +6,9 @@ The Interaction between Python and C++ can be simplified as two steps:
 1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time.
-2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ fo finish Op construction task.
+2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task.
-### Message form C++ to Python
+### Message from C++ to Python
 We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.”
@@ -193,7 +193,7 @@ def fc_layer(input, size, with_bias, activation):
 	elif:
 		# ...
 	return act_output;
-``` 
+```
 ### Low Leval API

--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
+# Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+ private:
+  vector<LoDTensor> values_;
+};
+```
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example for a RNN based on dynamic operators is 
+```python
+input = pd.data(...)
+num_steps = Var(12)
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+step = Var(1)
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(step+1)
+output = step_outputs.stack()
+```
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step.
+An RNN can be implemented with the following pseudocode
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+## Why `TensorArray`
+The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a separate module.
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. 
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
+This is where the notion of `TensorArray` comes from.
+## Introduce TensorArray to unify all the three RNNs
+TensorArray as a new concept is borrowed from TensorFlow, 
+it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
+This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
+such as `recurrent_op`, `RecurrentGradientMachine`.
+In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
+`TensorArray` is used to segment inputs and store states in all time steps.
+By providing some methods similar to a C++ array,
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+## Dynamic-operations on TensorArray
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+It is tedious for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use,
+for example
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to concatenate all the time steps for RNN or whileloop.
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split a tensor into time steps for RNN or whileloop.
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in TensorArray will
+        be shared with the tensor passed in.
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+    def size(self, output):
+        '''
+        Return the number of values.
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
+## LoDTensor-related Supports
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+Since each step of RNN can only take a tensor-represented batch of data as input, 
+some preprocessing should be performed on the inputs, such as sorting the sentences by their length in descending order, then cutting each word and packing them into new batches.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
+These two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+Some definitions are like
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+With these two methods, an RNN that supports variable-length sentences can be implemented like
+```c++
+// input is the variable-length data
+LodTensor sentence_input(xxx);
+TensorArray ta;
+Tensor indice_map;
+Tensor boot_state = xxx; // to initialize rnn's first state
+TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
+TensorArray step_outputs;
+TensorArray states;
+for (int step = 0; step < ta.size(); step++) {
+  auto state = states.read(step);
+  // rnnstep is a function which acts like a step of RNN
+  auto step_input = ta.read(step);
+  auto step_output = rnnstep(step_input, state);
+  step_outputs.write(step_output, true/*data_shared*/);
+}
+// rnn_output is the final output of an rnn
+LoDTensor rnn_output = ta.pack(ta, indice_map);
+```
+The code above shows that by embedding the LoDTensor-related preprocessing operations into `TensorArray`,
+the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together, making it hard to read and extend.
--- a/doc/design/test.dot
+++ b/doc/design/test.dot
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+    D_f -> g_loss;
+    label2 -> g_loss;
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+    d_loss [color=red];
+    g_loss [color=green];
+}
--- a/doc/design/test.dot.png
+++ b/doc/design/test.dot.png
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
 ## Background
 PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
-PaddlePaddle use proto message to describe compile time graph for
+PaddlePaddle uses proto messages to describe the compile time graph because
 1. Computation graph should be able to be saved to a file.
 1. In distributed training, the graph will be serialized and sent to multiple workers.
@@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep
 ## Definition of VarDesc
-A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it.
+A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
 ```proto
 message VarDesc {
  required string name = 1;
-  optional LoDTensorDesc lod_tensor = 2;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LoDTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
 }
 ```
-## Definition of LodTensorDesc
+## Definition of TensorDesc
 ```proto
 enum DataType {
@@ -38,87 +45,25 @@ enum DataType {
  FP64 = 6;
 }
-message LoDTensorDesc {
+message TensorDesc {
  required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  optional int32 lod_level = 3 [default=0];
 }
 ```
-## Definition of Variable in Python
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
-In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable.
-```python
-image = Variable(dims=[-1, 640, 480])
-# fc1 and fc2 are both Variable
-fc1 = layer.fc(input=image, output_size=10)
-fc2 = layer.fc(input=fc1, output_size=20)
-```
-### what should class `Variable` Have
-1. `name`.a name of string type is used to mark the value of the Variable.
-1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator.
-1. `operator`. Variable should record which operator produce itself. The reaon is:
-  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
-In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
-```python
-import VarDesc
-import LoDTensorDesc
-import framework
-def AddInitialOperator(variable, initializer):
-	# add an initialize Operator to block to init this Variable
-class Variable(object):
-   def __init__(self, name, dims, type, initializer):
-      self._block = get_default_block()
-      self._name = name
-      self.op = None
-      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
-      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
-      self._var = framework.CreateVar(_var_desc)
-      self._block.add_var(self)
-      # add initial op according to initializer
+## Definition of LodTensorDesc
-      if initializer is not None:
-          AddInitialOperator(self, initializer)
-   def dims(self):
-      return self._var.dims()
-   def data_type(self):
-       return self._var.data_type()
-   def to_proto(self):
+```proto
-       pass
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
 ```
-Then we can use this Variable to create a fc layer in Python.
+A LoDTensorDesc contains a tensor and a lod_level.
-```python
+## Definition of Variable in Python
-import paddle as pd
-def flatten_size(X, num_flatten_dims):
-  prod = 1 # of last num_flatten_dims
-  for i in xrange(num_flatten_dims):
-    prod = prod * X.dims[-i-1]
-  return prod
-def layer.fc(X, output_size, num_flatten_dims):
-  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
-  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
-  out = Variable(type=FP32)
-  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
-  pd.InferShape(y)
-  return out
-x = Variable(dims=[-1, 640, 480])
-y = layer.fc(x, output_size=100)
-z = layer.fc(y, output_size=200)
-paddle.eval(targets=[z], ...)
+For Variable in Python, please refer to [`Python API`](./python_api.md).
-print(z)
-```
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
--- a/doc/faq/cluster/index_cn.rst
+++ b/doc/faq/cluster/index_cn.rst
+###############
+集群训练与预测
+###############
+..  contents::
+1. 集群多节点训练，日志中保存均为网络通信类错误
+------------------------------------------------
+集群多节点训练，日志报错为网络通信类错误，比如 :code:`Connection reset by peer` 等。
+此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出，从而引发其他节点无法连接导致，可以参考下面的步骤排查：
+* 从 :code:`train.log` ， :code:`server.log` 找到最早报错的地方，查看是否是其他错误引发的报错（比如FPE，内存不足，磁盘空间不足等）。
+* 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
--- a/doc/faq/src/reduce_min_pool_size.py
+++ b/doc/faq/src/reduce_min_pool_size.py
--- a/doc/faq/src/word2vec_config.py
+++ b/doc/faq/src/word2vec_config.py
--- a/doc/faq/src/word2vec_dataprovider.py
+++ b/doc/faq/src/word2vec_dataprovider.py
--- a/doc/faq/model/index_cn.rst
+++ b/doc/faq/model/index_cn.rst
--- a/doc/faq/parameter/index_cn.rst
+++ b/doc/faq/parameter/index_cn.rst
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
--- a/doc/tutorials/sentiment_analysis/bi_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/bi_lstm.jpg
--- a/doc/tutorials/text_generation/encoder-decoder-attention-model.png
+++ b/doc/tutorials/text_generation/encoder-decoder-attention-model.png
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
--- a/doc/howto/dev/use_eigen_en.md
+++ b/doc/howto/dev/use_eigen_en.md
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
--- a/doc/howto/usage/cluster/src/trainer.png
+++ b/doc/howto/usage/cluster/src/trainer.png
--- a/doc/howto/usage/cluster/src/trainer_cn.png
+++ b/doc/howto/usage/cluster/src/trainer_cn.png
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
--- a/doc/howto/usage/cluster/src/word2vec/prepare.py
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
--- a/doc/survey/cluster_bootstrapping_tools.md
+++ b/doc/survey/cluster_bootstrapping_tools.md
--- a/doc/tutorials/image_classification/cifar.png
+++ b/doc/tutorials/image_classification/cifar.png
--- a/doc/tutorials/image_classification/image_classification.png
+++ b/doc/tutorials/image_classification/image_classification.png
--- a/doc/tutorials/image_classification/index_cn.md
+++ b/doc/tutorials/image_classification/index_cn.md
--- a/doc/tutorials/image_classification/index_en.md
+++ b/doc/tutorials/image_classification/index_en.md
--- a/doc/tutorials/image_classification/lenet.png
+++ b/doc/tutorials/image_classification/lenet.png
--- a/doc/tutorials/image_classification/plot.png
+++ b/doc/tutorials/image_classification/plot.png
--- a/doc/tutorials/image_classification/src/cifar.png
+++ b/doc/tutorials/image_classification/src/cifar.png
--- a/doc/tutorials/image_classification/src/image_classification.png
+++ b/doc/tutorials/image_classification/src/image_classification.png
--- a/doc/tutorials/image_classification/src/lenet.png
+++ b/doc/tutorials/image_classification/src/lenet.png
--- a/doc/tutorials/image_classification/src/plot.png
+++ b/doc/tutorials/image_classification/src/plot.png
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
--- a/doc/tutorials/rec/ml_dataset_cn.md
+++ b/doc/tutorials/rec/ml_dataset_cn.md
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ b/doc/tutorials/rec/ml_dataset_en.md
--- a/doc/tutorials/rec/ml_regression_cn.rst
+++ b/doc/tutorials/rec/ml_regression_cn.rst
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
--- a/doc/tutorials/rec/rec_regression_network.png
+++ b/doc/tutorials/rec/rec_regression_network.png
--- a/doc/tutorials/semantic_role_labeling/feature.jpg
+++ b/doc/tutorials/semantic_role_labeling/feature.jpg
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ b/doc/tutorials/semantic_role_labeling/index_cn.md
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ b/doc/tutorials/semantic_role_labeling/index_en.md
--- a/doc/tutorials/semantic_role_labeling/network_arch.png
+++ b/doc/tutorials/semantic_role_labeling/network_arch.png
--- a/doc/tutorials/semantic_role_labeling/src/curve.jpg
+++ b/doc/tutorials/semantic_role_labeling/src/curve.jpg
--- a/doc/tutorials/semantic_role_labeling/src/feature.jpg
+++ b/doc/tutorials/semantic_role_labeling/src/feature.jpg
--- a/doc/tutorials/semantic_role_labeling/src/network_arch.png
+++ b/doc/tutorials/semantic_role_labeling/src/network_arch.png
--- a/doc/tutorials/sentiment_analysis/index_cn.md
+++ b/doc/tutorials/sentiment_analysis/index_cn.md
--- a/doc/tutorials/sentiment_analysis/index_en.md
+++ b/doc/tutorials/sentiment_analysis/index_en.md
--- a/doc/tutorials/sentiment_analysis/lstm.png
+++ b/doc/tutorials/sentiment_analysis/lstm.png
--- a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
--- a/doc/tutorials/sentiment_analysis/src/lstm.png
+++ b/doc/tutorials/sentiment_analysis/src/lstm.png
--- a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
--- a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
+++ b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
--- a/doc/tutorials/text_generation/index_cn.md
+++ b/doc/tutorials/text_generation/index_cn.md
--- a/doc/tutorials/text_generation/index_en.md
+++ b/doc/tutorials/text_generation/index_en.md
--- a/doc/v1_api_tutorials/README.md
+++ b/doc/v1_api_tutorials/README.md
--- a/doc/tutorials/embedding_model/index_cn.md
+++ b/doc/tutorials/embedding_model/index_cn.md
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
--- a/doc/tutorials/embedding_model/neural-n-gram-model.png
+++ b/doc/tutorials/embedding_model/neural-n-gram-model.png
--- a/doc/tutorials/gan/gan.png
+++ b/doc/tutorials/gan/gan.png
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
--- a/doc/tutorials/gan/mnist_sample.png
+++ b/doc/tutorials/gan/mnist_sample.png
--- a/doc/tutorials/gan/uniform_sample.png
+++ b/doc/tutorials/gan/uniform_sample.png
--- a/doc/tutorials/imagenet_model/resnet_block.jpg
+++ b/doc/tutorials/imagenet_model/resnet_block.jpg
--- a/doc/tutorials/imagenet_model/resnet_model_cn.md
+++ b/doc/tutorials/imagenet_model/resnet_model_cn.md
--- a/doc/tutorials/imagenet_model/resnet_model_en.md
+++ b/doc/tutorials/imagenet_model/resnet_model_en.md
--- a/doc/tutorials/quick_start/index_cn.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
--- a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
--- a/doc/tutorials/quick_start/src/NetContinuous_en.png
+++ b/doc/tutorials/quick_start/src/NetContinuous_en.png
--- a/doc/tutorials/quick_start/src/NetConv_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetConv_cn.jpg
--- a/doc/tutorials/quick_start/src/NetConv_en.png
+++ b/doc/tutorials/quick_start/src/NetConv_en.png
--- a/doc/tutorials/quick_start/src/NetLR_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetLR_cn.jpg
--- a/doc/tutorials/quick_start/src/NetLR_en.png
+++ b/doc/tutorials/quick_start/src/NetLR_en.png
--- a/doc/tutorials/quick_start/src/NetRNN_cn.jpg
+++ b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
--- a/doc/tutorials/quick_start/src/NetRNN_en.png
+++ b/doc/tutorials/quick_start/src/NetRNN_en.png
--- a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
+++ b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
--- a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineTest_en.png
+++ b/doc/tutorials/quick_start/src/PipelineTest_en.png
--- a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
+++ b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
--- a/doc/tutorials/quick_start/src/PipelineTrain_en.png
+++ b/doc/tutorials/quick_start/src/PipelineTrain_en.png
--- a/doc/tutorials/quick_start/src/Pipeline_cn.jpg
+++ b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
--- a/doc/tutorials/quick_start/src/Pipeline_en.jpg
+++ b/doc/tutorials/quick_start/src/Pipeline_en.jpg
--- a/go/glide.lock
+++ b/go/glide.lock
--- a/go/glide.yaml
+++ b/go/glide.yaml
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
--- a/paddle/capi/export.sym
+++ b/paddle/capi/export.sym
--- a/paddle/capi/export.map
+++ b/paddle/capi/export.map
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
--- a/paddle/framework/op_proto_maker.cc
+++ b/paddle/framework/op_proto_maker.cc
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
--- a/paddle/framework/proto_desc.h
+++ b/paddle/framework/proto_desc.h
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
--- a/paddle/framework/tensor_array_test.cc
+++ b/paddle/framework/tensor_array_test.cc
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
--- a/paddle/framework/variable.md
+++ b/paddle/framework/variable.md
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
--- a/paddle/gserver/tests/mkldnn_branches_conv.conf
+++ b/paddle/gserver/tests/mkldnn_branches_conv.conf
--- a/paddle/gserver/tests/mkldnn_branches_fc.conf
+++ b/paddle/gserver/tests/mkldnn_branches_fc.conf
--- a/paddle/gserver/tests/mkldnn_branches_pool.conf
+++ b/paddle/gserver/tests/mkldnn_branches_pool.conf
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
--- a/paddle/gserver/tests/test_CRFLayerGrad.cpp
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
--- a/paddle/memory/.clang-format
+++ b/paddle/memory/.clang-format
--- a/paddle/memory/.clang-format
+++ b/paddle/memory/.clang-format
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
--- a/paddle/operators/.clang-format
+++ b/paddle/operators/.clang-format
--- a/paddle/operators/.clang-format
+++ b/paddle/operators/.clang-format
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/batch_norm_op.md
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
--- a/paddle/operators/concat_op.cu
+++ b/paddle/operators/concat_op.cu
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
--- a/paddle/operators/conv2d_op.h
+++ b/paddle/operators/conv2d_op.h
--- a/paddle/operators/conv2dtranspose_op.cc
+++ b/paddle/operators/conv2dtranspose_op.cc
--- a/paddle/operators/conv2dtranspose_op.cu
+++ b/paddle/operators/conv2dtranspose_op.cu
--- a/paddle/operators/conv2dtranspose_op.h
+++ b/paddle/operators/conv2dtranspose_op.h
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
--- a/paddle/operators/dynamic_recurrent_op.h
+++ b/paddle/operators/dynamic_recurrent_op.h
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_constant_op.cu
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
--- a/paddle/operators/images/batch_norm_fork.dot
+++ b/paddle/operators/images/batch_norm_fork.dot
--- a/paddle/operators/images/batch_norm_fork.png
+++ b/paddle/operators/images/batch_norm_fork.png
--- a/paddle/operators/images/batch_norm_op_kernel.png
+++ b/paddle/operators/images/batch_norm_op_kernel.png
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/increment_op.cu
--- a/paddle/operators/increment_op.h
+++ b/paddle/operators/increment_op.h
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
--- a/paddle/operators/lstm_op.cu
+++ b/paddle/operators/lstm_op.cu
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
--- a/paddle/operators/math/detail/CMakeLists.txt
+++ b/paddle/operators/math/detail/CMakeLists.txt
--- a/paddle/operators/math/detail/hl_activation_functions.h
+++ b/paddle/operators/math/detail/hl_activation_functions.h
--- a/paddle/operators/math/detail/hl_avx_functions.cc
+++ b/paddle/operators/math/detail/hl_avx_functions.cc
--- a/paddle/operators/math/detail/hl_avx_functions.h
+++ b/paddle/operators/math/detail/hl_avx_functions.h
--- a/paddle/operators/math/detail/hl_cpu_functions.cc
+++ b/paddle/operators/math/detail/hl_cpu_functions.cc
--- a/paddle/operators/math/detail/hl_functions.h
+++ b/paddle/operators/math/detail/hl_functions.h
--- a/paddle/operators/math/detail/hl_gpu_functions.h
+++ b/paddle/operators/math/detail/hl_gpu_functions.h
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
--- a/paddle/operators/matmul_op.cu
+++ b/paddle/operators/matmul_op.cu
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
--- a/paddle/operators/pad_op.cu
+++ b/paddle/operators/pad_op.cu
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu
+++ b/paddle/operators/pool_op.cu
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.cu
+++ b/paddle/operators/pool_with_index_op.cu
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/operators/proximal_gd_op.cu
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/operators/proximal_gd_op.h
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/operators/rmsprop_op.h
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
--- a/paddle/operators/sequence_concat_op.cu
+++ b/paddle/operators/sequence_concat_op.cu
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/operators/smooth_l1_loss_op.cu
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/operators/split_op.cu
+++ b/paddle/operators/split_op.cu
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
--- a/paddle/operators/strided_memcpy.h
+++ b/paddle/operators/strided_memcpy.h
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
--- a/paddle/operators/transpose_op.cu
+++ b/paddle/operators/transpose_op.cu
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
--- a/paddle/platform/details/device_ptr_cast.h
+++ b/paddle/platform/details/device_ptr_cast.h
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
--- a/paddle/pybind/.clang-format
+++ b/paddle/pybind/.clang-format
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/exception.cc
+++ b/paddle/pybind/exception.cc
--- a/paddle/pybind/exception.h
+++ b/paddle/pybind/exception.h
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/protobuf.h
+++ b/paddle/pybind/protobuf.h
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/scripts/cluster_train_v2/fabric/conf.py
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
--- a/paddle/scripts/cluster_train_v2/fabric/run.sh
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
--- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/scripts/travis/build_ios.sh
+++ b/paddle/scripts/travis/build_ios.sh
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
--- a/paddle/string/.clang-format
+++ b/paddle/string/.clang-format
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/framework/tests/.gitignore
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_adadelta_op.py
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_adam_op.py
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
--- a/python/paddle/v2/framework/tests/test_clip_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_op.py
--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
--- a/python/paddle/v2/framework/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
--- a/python/paddle/v2/framework/tests/test_crop_op.py
+++ b/python/paddle/v2/framework/tests/test_crop_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_div_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/framework/tests/test_exception.py
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
--- a/python/paddle/v2/framework/tests/test_increment_op.py
+++ b/python/paddle/v2/framework/tests/test_increment_op.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
--- a/python/paddle/v2/framework/tests/test_lookup_table.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_matmul_op.py
+++ b/python/paddle/v2/framework/tests/test_matmul_op.py
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
--- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_multiplex_op.py
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/framework/tests/test_parameter.py
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
--- a/python/paddle/v2/framework/tests/test_proximal_gd_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
--- a/python/paddle/v2/framework/tests/test_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
--- a/python/paddle/v2/framework/tests/test_rnn_helpers.py
+++ b/python/paddle/v2/framework/tests/test_rnn_helpers.py
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/framework/tests/test_selected_rows.py
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
--- a/python/paddle/v2/framework/tests/test_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/framework/tests/test_variable.py
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/v1_api_demo/README.md
+++ b/v1_api_demo/README.md