Merge branch 'develop' of github.com:PaddlePaddle/Paddle into hsigmoid_op

2ce56940 · Yancey1989 · 1abd3b3a · 90fc4a6c · 2ce56940 · 2ce56940
305 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,14 +108,11 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
-if(WITH_MKL)
+set(WITH_MKLML ${WITH_MKL})
-    set(WITH_MKLML ON)
+if (WITH_MKL AND AVX2_FOUND)
-    set(WITH_MKLDNN ${AVX2_FOUND})
+    set(WITH_MKLDNN ON)
-    if(NOT WITH_MKLDNN)
-        message(WARNING "Do not have AVX2 intrinsics and disabled MKL-DNN")
-    endif()
 else()
-    set(WITH_MKLML OFF)
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
    set(WITH_MKLDNN OFF)
 endif()
@@ -166,10 +163,7 @@ set(EXTERNAL_LIBS
 )
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+  include(cuda)
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
 endif(WITH_GPU)
 if(WITH_MKLML)

--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -12,11 +12,11 @@ Machine:
 System: CentOS release 6.3 (Final), Docker 1.12.1.
-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
+PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- MKL-DNN tag v0.11
- MKL-DNN tag v0.10
+- MKLML 2018.0.1.20171007
- MKLML 2018.0.20170720
 - OpenBLAS v0.2.20
+(TODO: will rerun after 0.11.0)
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
@@ -31,17 +31,37 @@ Input image size - 3 * 224 * 224, Time: images/second
 | BatchSize    | 64    | 128  | 256     |
 |--------------|-------| -----| --------|
-| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
-| MKLML        | 11.02 | 12.86 | 15.33  |
+| MKLML        | 12.12 | 13.70 | 16.18  |
-| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
+chart on batch size 128
+TBD
+ - ResNet-50
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
+| MKLML        | 32.52 | 31.89 | 33.12  |
+| MKL-DNN      | 81.69 | 82.35 | 84.08  |
 chart on batch size 128
 TBD
- - ResNet
 - GoogLeNet
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
+| MKLML        | 128.46| 137.89| 158.63 |
+| MKL-DNN      | 250.46| 264.83| 269.50 |
+chart on batch size 128
+TBD
 ### Laptop
 TBD
 ### Desktop

--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -5,6 +5,7 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+use_gpu = get_config_arg('use_gpu', bool, True)
 args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
 define_py_data_sources2(
@@ -16,6 +17,8 @@ settings(
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))
+conv_projection = conv_projection if use_gpu else img_conv_layer
 def inception2(name, input, channels, \
    filter1,
    filter3R, filter3,
@@ -138,7 +141,7 @@ def inception(name, input, channels, \
    cat = concat_layer(
        name=name,
        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
        act=ReluActivation())
    return cat

--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -40,6 +40,7 @@ fi
 for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50  $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
  done
 done
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
+# Skip the rest of this file when GPU support is disabled.
+if(NOT WITH_GPU)
+    return()
+endif()
+# Space-separated GPU architecture numbers; each entry NN is later expanded to
+# "-gencode arch=compute_NN,code=sm_NN" by select_nvcc_arch_flags.
+# The default list is used as-is unless the CUDA version check further down
+# replaces it with the CUDA 7.x or CUDA 8.x variant.
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+# Variant selected when CUDA_VERSION is 7.x.
+set(paddle_known_gpu_archs7 "30 35 50 52")
+# Variant selected when CUDA_VERSION is 8.x.
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+######################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  # Sets ${out_variable} in the caller's scope to the compute capabilities
+  # reported by the probe below, or to paddle_known_gpu_archs when the probe
+  # fails. The probe only runs while CUDA_gpu_detect_output is unset/empty; a
+  # successful result is written to the cache and reused afterwards.
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+    # Generate a small CUDA program that prints "major.minor " for every
+    # device and returns -1 when the device count cannot be read or is zero.
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+    # Compile and run the probe in one step via "nvcc --run"; stderr is
+    # discarded and stdout is captured in nvcc_out.
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      # Literal ';' characters are escaped first so that turning newlines into
+      # ';' below produces exactly one list element per output line.
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      # Rewrite 2.1 as "2.1(2.0)"; select_nvcc_arch_flags reads the N(M) form
+      # as code=sm_N built from arch=compute_M.
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+  # Fall back to every known architecture when nothing was detected.
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # Builds the nvcc "-gencode" arguments for the architecture preset named by
+  # the CUDA_ARCH_NAME cache variable.
+  # Outputs (set in the caller's scope):
+  #   ${out_variable}          - list of "-gencode arch=...,code=..." arguments
+  #   ${out_variable}_readable - space-separated sm_NN / compute_NN names
+  #
+  # List of arch names. "Volta" has to be listed here: the validation below
+  # rejects any name that is not in this list, which would otherwise make the
+  # Volta branch further down unreachable.
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    # Auto-detection runs a probe on the build machine, so it is only offered
+    # when not cross-compiling.
+    list(APPEND archs_names "Auto")
+  endif()
+  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architecture names are supported.")
+  endif()
+  # CUDA_ARCH_BIN / CUDA_ARCH_PTX are only exposed in "Manual" mode; in every
+  # other mode they are removed from the cache so stale values are not used.
+  # Comparisons quote the expansion so an empty or variable-like value cannot
+  # break or redirect the if() test.
+  if("${CUDA_ARCH_NAME}" STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+  # Map the preset name to a space-separated list of architectures.
+  if("${CUDA_ARCH_NAME}" STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif("${CUDA_ARCH_NAME}" STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif("${CUDA_ARCH_NAME}" STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif("${CUDA_ARCH_NAME}" STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif("${CUDA_ARCH_NAME}" STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif("${CUDA_ARCH_NAME}" STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # "Manual"
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+# Restrict the known architecture list to what the detected toolkit targets.
+# Quoted VERSION_LESS compares version components and, unlike an unquoted
+# numeric LESS, stays syntactically valid when CUDA_VERSION is empty.
+if("${CUDA_VERSION}" VERSION_LESS "7.0")
+  # Older than CUDA 7.0: paddle_known_gpu_archs is left unchanged.
+elseif("${CUDA_VERSION}" VERSION_LESS "8.0") # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif("${CUDA_VERSION}" VERSION_LESS "9.0") # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
+# CUDA 9.0 and newer keep the full default list.
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+# Without WITH_DSO the remaining CUDA libraries are added to the link list too.
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif()
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+# Host compiler flags are not propagated to nvcc; the flags nvcc needs are
+# listed explicitly below.
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
+# So, don't set these flags here.
+# Set C++11 support
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set --expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+# Forward the per-configuration C++ flags of the selected build type to nvcc.
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(yiwang): The annoying warnings mentioned in
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
-    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
-    # gflags.  I fired a PR https://github.com/gflags/gflags/pull/230
-    # to fix it.  Before it gets accepted by the gflags team, we use
-    # my personal fork, which contains above fix, temporarily.  Let's
-    # change this back to the official Github repo once my PR is
-    # merged.
-    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
-    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -149,58 +149,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
 endforeach()
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-      list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-# Custom gpu architecture
-set(CUDA_ARCH)
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -459,11 +459,11 @@ function(py_test TARGET_NAME)
  if(WITH_TESTING)
    set(options STATIC static SHARED shared)
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
-             python2 ${py_test_SRCS}
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction()
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -168,17 +168,3 @@ function(create_resources res_file output_file)
    COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
    DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()
-# Create a python unittest using run_python_tests.sh,
-# which takes care of making correct running environment
-function(add_python_test TEST_NAME)
-    foreach(arg ${ARGN})
-        get_filename_component(py_fn ${arg} NAME_WE)
-        set(TRG_NAME ${TEST_NAME}_${py_fn})
-        add_test(NAME ${TRG_NAME}
-                COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
-                python2 ${arg}
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    endforeach()
-endfunction()
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -335,6 +335,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
 power
 -----
 ..  autoclass:: paddle.v2.layer.power
@@ -372,6 +382,11 @@ cos_sim
 ..  autoclass:: paddle.v2.layer.cos_sim
    :noindex:
+l2_distance
+-----------
+..  autoclass:: paddle.v2.layer.l2_distance
+    :noindex:
 trans
 -----
 ..  autoclass:: paddle.v2.layer.trans

--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
 # Python Data Reader Design Doc
-At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
- A *reader creator* is a function that returns a reader function.
+- A *reader creator*: A function that returns a reader function.
- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
-and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
 ## Data Reader Interface
-Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
 ```
 iterable = data_reader()
 ```
-Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
-An example implementation for single item data reader creator:
+An example implementation for single item data reader creator is as follows:
 ```python
 def reader_creator_random_image(width, height):
@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
    return reader
 ```
-An example implementation for multiple item data reader creator:
+An example implementation for multiple item data reader creator is as follows:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
    def reader():
@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
 ## Batch Reader Interface
-*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+Here are some valid outputs:
-Here are valid outputs:
 ```python
 # a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
 [(1, 1, 1),
@@ -58,20 +59,22 @@ Here are valid outputs:
 Please note that each item inside the list must be a tuple, below is an invalid output:
 ```python
 # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
- # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
- # or three column of datas, each of which is 1.
+ # or three columns of data, each of which is 1.
 [[1,1,1],
 [2,2,2],
 [3,3,3]]
 ```
-It's easy to convert from reader to batch reader:
+It is easy to convert from a reader to a batch reader:
 ```python
 mnist_train = paddle.dataset.mnist.train()
 mnist_train_batch_reader = paddle.batch(mnist_train, 128)
 ```
-Also easy to create custom batch reader:
+It is also straightforward to create a custom batch reader:
 ```python
 def custom_batch_reader():
    while True:
@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
 ## Usage
-batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
 ```python
 # two data layer is created:
@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 ## Data Reader Decorator
-*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
-Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
 ### Prefetch Data
-Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
 Use `paddle.reader.buffered` to prefetch data:
@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 ### Compose Multiple Data Readers
-For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+For example, suppose we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
-We can do:
+We can do the following:
 ```python
 def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
 reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
 paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 ### Shuffle
-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 Example:
 ```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 ## Q & A
-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).
-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training?
-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?
-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
-### How to create custom data reader creator
+### How to create a custom data reader creator?
 ```python
 def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 ### How is `paddle.train` implemented
-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:
 ```python
 def train(batch_reader, mapping, batch_size, total_pass):

--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
-经典的线性回归任务
-==================
-PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
-任务简介
--------
-我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
-一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
-准备数据
-----------
-假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
-.. code-block:: python
-    # dataprovider.py
-    from paddle.trainer.PyDataProvider2 import *
-    import random
-    # 定义输入数据的类型: 2个浮点数
-    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-    def process(settings, input_file):
-        for i in xrange(2000):
-            x = random.random()
-            yield [x], [2*x+0.3]
-训练模型
-----------
-为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-在PaddlePaddle里，该模型的网络配置如下。
-.. code-block:: python
-    # trainer_config.py
-    from paddle.trainer_config_helpers import *
-    # 1. 定义数据来源，调用上面的process函数获得观测数据
-    data_file = 'empty.list'
-    with open(data_file, 'w') as f: f.writelines(' ')
-    define_py_data_sources2(train_list=data_file, test_list=None, 
-                            module='dataprovider', obj='process',args={})
-    # 2. 学习算法。控制如何改变模型参数 w 和 b
-    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-    # 3. 神经网络配置
-    x = data_layer(name='x', size=1)
-    y = data_layer(name='y', size=1)
-    # 线性计算网络层: ȳ = wx + b
-    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = square_error_cost(input= ȳ, label=y)
-    outputs(cost)
-这段简短的配置展示了PaddlePaddle的基本用法：
- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
-定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
-.. code-block:: bash
-    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
-模型检验
-----------
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-.. code-block:: python
-    import numpy as np
-    import os
-    def load(file_name):
-        with open(file_name, 'rb') as f:
-            f.read(16) # skip header for float type.
-            return np.fromfile(f, dtype=np.float32)
-    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-    # w=1.999743, b=0.300137
-.. image:: ./parameters.png
-     :align: center
-     :scale: 80 %
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
-这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
-Simple Linear Regression
-========================
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-Problem Background
------------------
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-Prepare the Data
-----------------
-Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-    .. code-block:: python
-        # dataprovider.py
-        from paddle.trainer.PyDataProvider2 import *
-        import random
-        # define data types of input: 2 real numbers
-        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-        def process(settings, input_file):
-            for i in xrange(2000):
-                x = random.random()
-                yield [x], [2*x+0.3]
-Train a NeuralNetwork
----------------------
-To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
-    .. code-block:: python
-        # trainer_config.py
-        from paddle.trainer_config_helpers import *
-        # 1. read data. Suppose you saved above python code as dataprovider.py
-        data_file = 'empty.list'
-        with open(data_file, 'w') as f: f.writelines(' ')
-        define_py_data_sources2(train_list=data_file, test_list=None, 
-                module='dataprovider', obj='process',args={})
-        # 2. learning algorithm
-        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-        # 3. Network configuration
-        x = data_layer(name='x', size=1)
-        y = data_layer(name='y', size=1)
-        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = square_error_cost(input=y_predict, label=y)
-        outputs(cost)
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-Now that everything is ready, you can train the network with a simple command line call:
-    .. code-block:: bash
-        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-Evaluate the Model
-------------------
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
-In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
-    .. code-block:: python
-        import numpy as np
-        import os
-        def load(file_name):
-            with open(file_name, 'rb') as f:
-                f.read(16) # skip header for float type.
-                return np.fromfile(f, dtype=np.float32)
-        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-        # w=1.999743, b=0.300137
-    .. image:: parameters.png
-        :align: center
-Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
--- a/doc/getstarted/basic_usage/parameters.png
+++ b/doc/getstarted/basic_usage/parameters.png
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
+从源码编译PaddlePaddle
+======================
+.. _build_step:
+编译方法
+----------------
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+编译PaddlePaddle，需要执行：
+.. code-block:: bash
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境，执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装也可以拷贝到目标机器安装：
+.. code-block:: bash
+   pip install python/dist/*.whl
+.. _compile_deps:
+编译依赖
+----------------
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+.. _build_options:
+编译选项
+----------------
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+..  code-block:: bash
+    cmake .. -DWITH_GPU=OFF
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则使用OpenBLAS", "ON"
+BLAS
+++++
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+CUDA/cuDNN
+++++++++++
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+编译选项的设置
++++++++++++++
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+..  code-block:: bash
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` ）**后，再指定。**
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
-Installing from Sources
-==========================
-* [1. Download and Setup](#download)
-* [2. Requirements](#requirements)
-* [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Centos](#centos)
-## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
-```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-```
-## <span id="requirements">Requirements</span>
-To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
- **Python**: only support Python 2.7
- **Go**
-**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
-For CUDA 8.0, GCC versions later than 5.3 are not supported!
-### Options
-PaddlePaddle supports some build options. 
-<html>
-<table> 
-<thead>
-<tr>
-<th scope="col" class="left">Optional</th>
-<th scope="col" class="left">Description</th>
-</tr>
-</thead>
-<tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
-<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
-<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
-<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
-<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
-<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
-<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
-</tbody>
-</table>
-</html>
-**Note:**
-  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
-  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
-  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
-As a simple example, consider the following:  
-1. **BLAS Dependencies(optional)**
-    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
-    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
-    ```bash
-    # specify MKL
-    cmake .. -DMKL_ROOT=<mkl_path>
-    # or specify OpenBLAS
-    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    ```
-2. **Doc Dependencies(optional)**
-    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
-    ```bash
-    pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme recommonmark
-    # install doxygen on Ubuntu
-    sudo apt-get install doxygen 
-    # install doxygen on Mac OS X
-    brew install doxygen
-    # active docs in cmake
-    cmake .. -DWITH_DOC=ON`
-    ```
-## <span id="ubuntu">Build on Ubuntu 14.04</span>
-### Install Dependencies
- **Paddle Dependencies**
-    ```bash
-    # necessary
-    sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo pip install 'protobuf==3.1.0.post1'
-    # Install Go
-    # You can follow https://golang.org/doc/install for a detailed explanation.
-    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C $HOME -xzf go.tgz && \
-    mkdir $HOME/gopath && \
-    rm go.tgz
-    # Setup environment variables
-    export GOROOT=$HOME/go
-    export GOPATH=$HOME/gopath
-    export PATH=$PATH:$GOROOT/bin
-    # install cmake 3.4
-    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-        cd .. && rm -rf cmake-3.4.1
-    ```
- **GPU Dependencies (optional)**
-    To build GPU version, you will need the following installed:
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-    After downloading cuDNN library, issue the following commands:
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-### Build and Install
-As usual, the best option is to create build folder under paddle project directory.
-```bash
-mkdir build && cd build
-``` 
-Finally, you can build and install PaddlePaddle:
-```bash
-# you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
-## <span id="centos">Build on Centos 7</span>
-### Install Dependencies
- **CPU Dependencies**
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
- **GPU Dependencies (optional)**
-    To build GPU version, you will need the following installed:
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-    After downloading cuDNN library, issue the following commands:
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-### Build and Install
-As usual, the best option is to create build folder under paddle project directory.
-```bash
-mkdir build && cd build
-``` 
-Finally, you can build and install PaddlePaddle:
-```bash
-# you can add build option here, such as:    
-cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
+Build PaddlePaddle from Sources
+===============================
+.. _build_step:
+How To Build
+----------------
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
+tools. We recommend you to use our pre-built Docker image to run the build
+to avoid installing dependencies by yourself. We have several build environment
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_.
+Then run:
+.. code-block:: bash
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build CPU-Only binaries if you are using docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # else run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+.. code-block:: bash
+   pip install python/dist/*.whl
+.. _compile_deps:
+Compile Dependencies
+--------------------
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+.. _build_options:
+Build Options
+----------------
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_.
+.. _build_options_bool:
+Bool Type Options
+-----------------
+You can add :code:`-D` argument to pass such options, like:
+..  code-block:: bash
+    cmake .. -DWITH_GPU=OFF
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_DOC", "Build documentation", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+BLAS
+++++
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+If you choose not to use MKL, then OpenBLAS will be used.
+CUDA/cuDNN
+++++++++++
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
+you built with.
+Pass Compile Options
+++++++++++++++++++++
+You can pass compile options to use intended BLAS/CUDA/cuDNN libraries.
+When running cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, for example:
+..  code-block:: bash
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+**NOTE: These options only take effect when running cmake for the first time. You need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change them.**
--- a/doc/getstarted/build_and_install/cmake.png
+++ b/doc/getstarted/build_and_install/cmake.png
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
-PaddlePaddle的编译选项
-======================
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-Bool型的编译选项
----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-..  code-block:: bash
-    cmake .. -DWITH_GPU=OFF
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-BLAS/CUDA/Cudnn的编译选项
--------------------------
-BLAS
-+++++
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-CUDA/Cudnn
-+++++++++++
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-编译选项的设置
-++++++++++++++
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-..  code-block:: bash
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
--- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv
+++ b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
-编译选项,描述,注意
-MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h，${MKL_ROOT}/lib目录下需要包含mkl_core，mkl_sequential和mkl_intel_lp64三个库。
-ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h，${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
-OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h，${OPENBLAS_ROOT}/lib下需要包含openblas库。
-REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h，${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ b/doc/getstarted/build_and_install/cmake/compile_options.csv
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
-WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
-PaddlePaddle的Docker容器使用方式
+使用Docker安装运行PaddlePaddle
 ================================
-PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
-Docker使用入门
+如果您在使用Windows，可以参考
------------------------------
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
-几个基础的概念帮助理解和使用Docker：
- *镜像*：一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行：
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
-  .. code-block:: bash
+.. _docker_pull:
-     docker images
+获取PaddlePaddle的Docker镜像
+------------------------------
-  来列出当前系统中的所有镜像，同样可以执行：
+执行下面的命令获取最新的PaddlePaddle Docker镜像
  .. code-block:: bash
-     docker pull paddlepaddle/paddle:0.10.0
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
+     docker pull paddlepaddle/paddle
- *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
+对于国内用户，我们提供了加速访问的镜像源：
-  实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
-  可以执行：
  .. code-block:: bash
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
-  来使用一个镜像启动一个容器。
+下载GPU版本的Docker镜像：
- 默认情况下，Docker容器会运行在独立的文件系统空间之上，我们无法在Docker容器中
-  访问到主机上的文件。可以通过*挂载Volume*的方式，将主机上的文件或目录挂载到
-  Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下，容器使用
-  debian镜像，并且启动后执行 :code:`ls /data`。
  .. code-block:: bash
-     docker run --rm -v $(pwd):/data debian ls /data
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
-PaddlePaddle发布的Docker镜像使用说明
------------------------------
-我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
-PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
-像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
-PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
-行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 
-和国内镜像`docker.paddlepaddle.org` 提供最新
-的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
-**注意：为了方便在国内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您在国内，请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
-1. 开发镜像：:code:`paddlepaddle/paddle:0.10.0-dev`
-   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
-   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
-   开发镜像包含了以下工具：
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作，
-   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
-   以交互容器方式运行开发镜像：
-   .. code-block:: bash
-      docker run -it --rm -v $(pwd):/paddle  paddlepaddle/paddle:0.10.0-dev /bin/bash
-   或者，可以以后台进程方式运行容器：
-   .. code-block:: bash
-      docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D
-   然后用密码 :code:`root` SSH进入容器：
-   .. code-block:: bash
-      ssh -p 2202 root@localhost
-   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
-2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
-   .. code-block:: bash
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-   如果输出是No，就需要选择使用no-AVX的镜像
-   **注：在0.10.0之后的版本，PaddlePaddle都可以自动判断硬件是否支持AVX，所以无需判断AVX即可使用**
-   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
+选择下载使用不同的BLAS库的Docker镜像：
-   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
-   .. code-block:: bash
+  .. code-block:: bash
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
-   注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
-   .. code-block:: bash
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
-      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+  .. code-block:: bash
-      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
-3. 运行以及发布您的AI程序
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
-   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
+.. _docker_run:
-   .. code-block:: bash
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
-      docker run -it -v $PWD:/work paddle /work/a.py
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
-   如果要使用GPU，请运行：
+  .. code-block:: bash
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的镜像； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
-      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
-   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0`
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
-   创建和发布自己的AI程序镜像。
-运行PaddlePaddle Book
+.. _docker_run_book:
---------------------
-Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+使用Docker启动PaddlePaddle Book教程
+------------------------------------
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
-.. code-block:: bash
+  .. code-block:: bash
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 然后在浏览器中输入以下网址：
-.. code-block:: text
+  .. code-block:: text
-    http://localhost:8888/
+     http://localhost:8888/
 就这么简单，享受您的旅程！
-通过Docker容器开发PaddlePaddle
+.. _docker_run_gpu:
------------------------------
-开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
-1. 制作PaddlePaddle开发镜像
+使用Docker执行GPU训练
+------------------------------
-   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。
-   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行：
-   .. code-block:: bash
-      git clone https://github.com/PaddlePaddle/Paddle.git
-      cd Paddle
-      docker build -t paddle:dev .
-   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
-2. 制作PaddlePaddle生产镜像
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
-   生产镜像的生成分为两步，第一步是运行：
+  .. code-block:: bash
-   .. code-block:: bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
-   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，”WITH_TEST“控制是否生成单元测试。
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
-   第二步是运行：
+  .. code-block:: bash
-   .. code-block:: bash
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
+**关于AVX：**
-3. 运行单元测试
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
-   运行以下指令：
+以下指令能检查Linux电脑是否支持AVX：
   .. code-block:: bash
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-文档
----
-Paddle的Docker开发镜像带有一个通过 `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
-只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-.. code-block:: bash
-   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
-接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
+如果输出是No，就需要选择使用no-AVX的镜像
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
 PaddlePaddle in Docker Containers
 =================================
-Docker container is currently the only officially-supported way to
+Run PaddlePaddle in Docker container so that you don't need to care about
-running PaddlePaddle.  This is reasonable as Docker now runs on all
+runtime dependencies; you can also run it on Windows. You can get
-major operating systems including Linux, Mac OS X, and Windows.
+Docker tutorials `here <https://docs.docker.com/get-started/>`_.
-Please be aware that you will need to change `Dockers settings
-<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
-of your hardware resource on Mac OS X and Windows.
-Working With Docker
+If you are using Windows, please refer to
-------------------
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running Docker on Windows.
-Docker is simple as long as we understand a few basic concepts:
+After you've read the above tutorials, you may proceed with the following steps.
- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type
+.. _docker_pull:
-  .. code-block:: bash
+Pull PaddlePaddle Docker Image
+------------------------------
-     docker images
-  to list all images in the system. We can also run
+Run the following command to download the latest Docker images:
  .. code-block:: bash
-     docker pull paddlepaddle/paddle:0.10.0
-  to download a Docker image, paddlepaddle/paddle in this example,
+     docker pull paddlepaddle/paddle
-  from Dockerhub.com.
- *container*: considering a Docker image a program, a container is a
+For users in China, we provide a faster mirror:
-  "process" that runs the image. Indeed, a container is exactly an
-  operating system process, but with a virtualized filesystem, network
-  port space, and other virtualized environment. We can type
  .. code-block:: bash
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
-  to start a container to run a Docker image, paddlepaddle/paddle in this example.
+Download GPU version images:
- By default docker container have an isolated file system namespace,
-  we can not see the files in the host file system. By using *volume*,
-  mounted files in host will be visible inside docker container.
-  Following command will mount current dirctory into /data inside
-  docker container, run docker container from debian image with
-  command :code:`ls /data`.
  .. code-block:: bash
-     docker run --rm -v $(pwd):/data debian ls /data
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
-Usage of CPU-only and GPU Images
----------------------------------
-We package PaddlePaddle's compile environment into a Docker image,
-called the develop image, it contains all compiling tools that
-PaddlePaddle needs. We package compiled PaddlePaddle program into a
-Docker image as well, called the production image, it contains all
-runtime environment that running PaddlePaddle needs. For each version
-of PaddlePaddle, we release both of them. Production image includes
-CPU-only version and a CUDA GPU version and their no-AVX versions.
-We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
-latest versions under "tags" tab at dockerhub.com. 
-** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.**
-1. development image :code:`paddlepaddle/paddle:<version>-dev`
-   This image has packed related develop tools and runtime
-   environment. Users and developers can use this image instead of
-   their own local computer to accomplish development, build,
-   releasing, document writing etc. While different version of paddle
-   may depends on different version of libraries and tools, if you
-   want to setup a local environment, you must pay attention to the
-   versions.  The development image contains:
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   Many developers use servers with GPUs, they can use ssh to login to
-   the server and run :code:`docker exec` to enter the docker
-   container and start their work.  Also they can start a development
-   docker image with SSHD service, so they can login to the container
-   and start work.
-2. Production images, this image might have multiple variants:
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-   Please be aware that the CPU-only and the GPU images both use the
-   AVX instruction set, but old computers produced before 2008 do not
-   support AVX.  The following command checks if your Linux computer
-   supports AVX:
-   .. code-block:: bash
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-   **NOTE：versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.**
-   To run the CPU-only image as an interactive container:
-   .. code-block:: bash
-      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
-   Above method work with the GPU image too -- the recommended way is
-   using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
-   Please install nvidia-docker first following this `tutorial
-   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
-   Now you can run a GPU image:
-   .. code-block:: bash
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
-Train Model Using Python API
----------------------------
-Our official docker image provides a runtime for PaddlePaddle
-programs. The typical workflow will be as follows:
-Create a directory as workspace:
-.. code-block:: bash
-   mkdir ~/workspace
-Edit a PaddlePaddle python program using your favourite editor
-.. code-block:: bash
-   emacs ~/workspace/example.py
-Run the program using docker:
-.. code-block:: bash
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
-Or if you are using GPU for training:
-.. code-block:: bash
+Choose between images built with different BLAS libraries:
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
+  .. code-block:: bash
-Above commands will start a docker container by running :code:`python
-/workspace/example.py`. It will stop once :code:`python
-/workspace/example.py` finishes.
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
-.. code-block:: bash
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-Running with GPU is identical:
-.. code-block:: bash
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-Develop PaddlePaddle or Train Model Using C++ API
---------------------------------------------------
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
-1. Build PaddlePaddle develop image
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
-   Use following command to build PaddlePaddle develop image:
-   .. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
+  .. code-block:: bash
-      docker build -t paddle:dev .
-2. Build PaddlePaddle production image
-   There are two steps for building production image, the first step is to run:
+     docker pull paddlepaddle/paddle:[tag]
+     # e.g.
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
-   .. code-block:: bash
+.. _docker_run:
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+Launch your training program in Docker
+--------------------------------------
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
-   The second step is to run:
+  .. code-block:: bash
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-      docker build -t paddle:prod -f build/Dockerfile ./build
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally,
+:code:`/work/train.py` is the command to run inside docker.
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+Also, you can go into the container shell, run or debug your code
+interactively:
-3. Run unit test
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
-   Following command will run unit test:
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
-   .. code-block:: bash
+.. _docker_run_book:
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
 PaddlePaddle Book
 ------------------
-The Jupyter Notebook is an open-source web application that allows
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
-you to create and share documents that contain live code, equations,
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
-visualizations and explanatory text in a single browser.
+for users and developers. If you want to
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 We provide a packaged book image, simply issue the command:
-.. code-block:: bash
+  .. code-block:: bash
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 Then, go back and paste the address into your local browser:
-.. code-block:: text
+  .. code-block:: text
-    http://localhost:8888/
+     http://localhost:8888/
 That's all. Enjoy your journey!
+.. _docker_run_gpu:
-Documentation
+Train with Docker with GPU
-------------
+------------------------------
-Paddle Docker images include an HTML version of C++ source code
+We recommend using
-generated using `woboq code browser
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
-<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
+to run GPU training jobs. Please ensure you have the latest
-for users to browse and understand the C++ source code.
+GPU driver installed before moving on.
-As long as we give the Paddle Docker container a name, we can run an
+  .. code-block:: bash
-additional Nginx Docker container to serve the volume from the Paddle
-container:
-.. code-block:: bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-   docker run -d --name paddle-cpu-doc paddle:<version>
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+  .. code-block:: bash
-Then we can direct our Web browser to the HTML version of source code
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-at http://localhost:8088/paddle/
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+**About AVX:**
+AVX is a CPU instruction set that can accelerate PaddlePaddle's computations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+The following command will tell you whether your computer supports AVX.
+   .. code-block:: bash
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,12 +6,13 @@
 安装流程
 ++++++++
-PaddlePaddle提供Docker镜像来部署环境。
+PaddlePaddle提供pip和Docker的安装方式：
 .. toctree::
   :maxdepth: 1
-   docker_install_cn.rst 
+   pip_install_cn.rst
+   docker_install_cn.rst
 编译流程
@@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。
 ..  warning::
-    编译流程主要推荐高级用户查看，普通用户请走安装流程。
+    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
 ..  toctree::
    :maxdepth: 1
-    cmake/build_from_source_cn.rst
+    build_from_source_cn.rst
+常见问题解答
++++++++++
+`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
 Install and Build
 =================
-Install PaddlePaddle
+.. _install_steps:
----------------------
-..  toctree::
+Install Steps
-    :maxdepth: 1
++++++++++++++
+You can choose either pip or Docker to complete your install:
+.. toctree::
+   :maxdepth: 1
+   pip_install_en.rst
+   docker_install_en.rst
-    docker_install_en.rst
 Build from Source
 -----------------
 ..  warning::
-    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    We recommend installing directly via the installation steps above; you'll only need to build PaddlePaddle from source when you need a modified binary.
 ..  toctree::
    :maxdepth: 1
    build_from_source_en.md
+FAQ
++++++++++
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
--- a/doc/getstarted/build_and_install/paddleci.png
+++ b/doc/getstarted/build_and_install/paddleci.png
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
+使用pip安装PaddlePaddle
+================================
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+.. _pip_install:
+使用pip安装
+------------------------------
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+  .. code-block:: bash
+     pip install paddlepaddle
+如果需要安装支持GPU的版本，需要执行：
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+.. _pip_dependency:
+运行环境依赖
+------------------------------
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+.. _pip_faq:
+安装常见问题和解决方法
+------------------------------
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+    .. code-block:: bash
+       pip install --upgrade pip
+  如果仍然存在问题，可以执行：
+      .. code-block:: bash
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
+Install PaddlePaddle Using pip
+================================
+You can use current widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method can be used in
+most of current Linux systems or MacOS.
+.. _pip_install:
+Install Using pip
+------------------------------
+Run the following command to install PaddlePaddle on the current
+machine, it will also download requirements.
+  .. code-block:: bash
+     pip install paddlepaddle
+If you wish to install GPU version, just run:
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+If you wish to install the latest develop branch PaddlePaddle, 
+you can download the latest whl package from our CI system. Access
+the links below, log in as guest, then click on the "Artifact"
+tab; you'll find the download links of the whl packages there.
+If one of the links below shows a login form, just click "Log in as guest" to start the download:
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+.. _pip_dependency:
+Runtime Dependency
+------------------------------
+PaddlePaddle installation packages (whl) contain not only .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+PaddlePaddle whl packages are trying to satisfy
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as the default build environment. But CUDA libraries
+seem to run only on CentOS 6 or later, and CentOS 5 is about to reach its end of life,
+so we use CentOS 6 as the default build environment.
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc must include at least the GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "Must include at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "Must include at least the GCC_3.3 symbols"
+.. _pip_faq:
+FAQ
+------------------------------
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, packages on PyPI follow the manylinux1 standard, so you'll need
+  pip >9.0.0 to install them. Upgrade pip with the command below:
+    .. code-block:: bash
+       pip install --upgrade pip
+  If the problem still exists, run the following command:
+      .. code-block:: bash
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+  This prints the package suffixes your system supports; check whether they match
+  the file name of the whl package. You can find the default whl package
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_.
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
 新手入门
 ============
+.. _quick_install:
+快速安装
++++++++
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装：
+  .. code-block:: bash
+     pip install paddlepaddle
+如果需要安装支持GPU的版本，需要执行：
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+更详细的安装和编译方法参考：
 ..  toctree::
  :maxdepth: 1
  build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
+.. _quick_start:
+快速开始
++++++++
+创建一个 housing.py 并粘贴此Python代码：
+  .. code-block:: python
+     import paddle.v2 as paddle
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
+..  toctree::
+  :maxdepth: 1
+  concepts/use_concepts_cn.rst
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
 GET STARTED
 ============
+.. _quick_install:
+Quick Install
+----------------------
+You can use pip to install PaddlePaddle with a single command. It supports
+CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12 or later, with Python 2.7 installed.
+Simply run the following command to install:
+  .. code-block:: bash
+     pip install paddlepaddle
+If you need to install GPU version, run:
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+For more details about installation and build:
 ..  toctree::
  :maxdepth: 1
  build_and_install/index_en.rst
- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
+.. _quick_start:
+Quick Start
++++++++++++
+Create a new file called housing.py, and paste this Python
+code:
+  .. code-block:: python
+     import paddle.v2 as paddle
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
+此教程会介绍如何使用Python的cProfile包，与Python库yep，google perftools来运行性能分析(Profiling)与调优。
+运行性能分析可以让开发人员科学的，有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中，真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。
+性能优化的步骤，通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。
+Paddle提供了Python语言绑定。用户使用Python进行神经网络编程，训练，测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库，进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分:
+* Python代码的性能分析
+* Python与C++混合代码的性能分析
+## Python代码的性能分析
+### 生成性能分析文件
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+```bash
+python -m cProfile -o profile.out main.py
+```
+其中`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。
+### 查看性能分析文件
+当main.py运行完毕后，性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来。
+使用`pip install cprofilev`安装`cprofilev`工具。安装完成后，使用如下命令开启HTTP服务
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+访问对应网址，即可显示性能分析的结果。性能分析结果格式如下:
+```text
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+每一列的含义是:
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+### 寻找性能瓶颈
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+将性能分析结果按照tottime排序，效果如下:
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+```text
+Called By:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+Called:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+## Python与C++混合代码的性能分析
+### 生成性能分析文件
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+```bash
+apt install libgoogle-perftools-dev
+pip install yep
+```
+安装完毕后，我们可以通过
+```bash
+python -m yep -v main.py
+```
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+### 查看性能分析文件
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用的是用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+```bash
+go get github.com/google/pprof
+```
+进而我们可以使用如下命令开启一个HTTP服务:
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+![result](./pprof_1.png)
+### 寻找性能瓶颈
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+例如下图中，
+![kernel_perf](./pprof_2.png)
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
+## 总结
+至此，两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式，Paddle的开发人员和使用人员可以有次序的，科学的发现和解决性能问题。
--- a/doc/howto/optimization/pprof_1.png
+++ b/doc/howto/optimization/pprof_1.png
--- a/doc/howto/optimization/pprof_2.png
+++ b/doc/howto/optimization/pprof_2.png
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
 }
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value) {
+                                            paddle_real* value) {
  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (ptr->mat == nullptr) return kPD_NULLPTR;
@@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
 }
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result) {
+                                            paddle_real* result) {
  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (ptr->mat == nullptr) return kPD_NULLPTR;

--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
 #include <paddle/capi.h>
 #include <time.h>
 #include "../common/common.h"
 #define CONFIG_BIN "./trainer_config.bin"
@@ -27,20 +28,19 @@ int main() {
  CHECK(paddle_arguments_resize(in_args, 1));
  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
                                           /* size */ 784,
                                           /* useGPU */ false);
  srand(time(0));
-  std::vector<paddle_real> input;
+  paddle_real* array;
-  input.resize(784 * 10);
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
-  for (int i = 0; i < input.size(); ++i) {
+  for (int i = 0; i < 784; ++i) {
-    input[i] = rand() / ((float)RAND_MAX);
+    array[i] = rand() / ((float)RAND_MAX);
  }
-  // Set value for the input matrix
-  CHECK(paddle_matrix_set_value(mat, input.data()));
  CHECK(paddle_arguments_set_value(in_args, 0, mat));
@@ -53,17 +53,18 @@ int main() {
  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-  std::std::vector<paddle_real> result;
+  uint64_t height;
-  int height;
+  uint64_t width;
-  int width;
-  CHECK(paddle_matrix_get_shape(prob, &height, &width);
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
-  result.resize(height * width);
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
-  CHECK(paddle_matrix_get_value(prob, result.data()));
-  printf("Prob: ");
+  printf("Prob: \n");
  for (int i = 0; i < height * width; ++i) {
-    printf("%.2f ", result[i]);
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
  }
  printf("\n");

--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
 * @note  value should contain enough elements of data to init the mat
 */
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value);
+                                            paddle_real* value);
 /**
 * @brief PDMatGetRow Get raw row buffer from matrix
@@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                          paddle_real** rawRowBuffer);
 /**
- * @brief copy data from the matrix 
+ * @brief copy data from the matrix
 * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data 
+ * @param [out] result pointer to store the matrix data
 * @return paddle_error
 * @note the space of the result should be allocated before invoking this API
 */
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result);
+                                            paddle_real* result);
 /**
 * @brief PDMatCreateNone Create None Matrix
 * @return

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -6,7 +6,10 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
@@ -51,10 +54,6 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
-cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
        proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -22,7 +22,6 @@
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 namespace paddle {
@@ -218,21 +217,6 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                     return false;
                   });
-    // process recurrent gradient op as a special operator.
-    if (forwardOp.Type() == "dynamic_recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-      // or this will result in infinite loop.
-      const auto& rnnop =
-          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
-      auto rnn_grad_op =
-          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
-      const auto& stepnet_op =
-          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
-      // create stepnet's gradient op
-      rnn_grad_op->rnn.SetStepUnit(
-          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
-    }
    if (net->ops_.empty()) {  // Current no aux op is added to network
      return grad_op;
    }
@@ -513,21 +497,16 @@ ParamGradInfoMap AppendBackward(
  const int root_block_idx = 0;
  auto root_block = program_desc.MutableBlock(root_block_idx);
-  // insert fill one op for target
-  // TODO(qiao) add some check to the target.
  std::string fill_one_op_out = GradVarName(target.Name());
-  std::vector<int64_t> target_shape_desc = target.Shape();
+  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
-  std::vector<int> target_shape;
+  PADDLE_ENFORCE(is_scalar, "target should be scalar");
-  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
-                 std::back_inserter(target_shape),
-                 [](int64_t dim) { return static_cast<int>(dim); });
  VLOG(3) << "backward from loss=" << target.Name()
          << " data_type=" << target.GetDataType();
  std::unique_ptr<OpDescBind> fill_one_op(
      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", target_shape},
+                     {{"shape", std::vector<int>{1}},
                      {"value", static_cast<float>(1.0)},
-                      {"data_type", target.GetDataType()}}));
+                      {"dtype", target.GetDataType()}}));
  // infer var type of fill_one_op
  fill_one_op->InferVarType(root_block);

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
  op->SetOutput("Out", {"out"});
  auto target = f::VarDescBind("out");
+  target.SetShape({1});
  auto var_to_grad = AppendBackward(program, target, {});
  ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
  op->CheckAttrs();
  auto target = f::VarDescBind("out");
+  target.SetShape({1});
  AppendBackward(program, target, {});
  ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
  op3->SetOutput("Out", {"out3"});
  auto target = f::VarDescBind("out3");
+  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {});
@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
  op4->SetOutput("Out", {"out4"});
  auto target = f::VarDescBind("out4");
+  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"out3"});
@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
  op2->SetOutput("Z", {"z2"});
  auto target = f::VarDescBind("z2");
+  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"z1"});
@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
  op3->SetOutput("Out", {"out3"});
  auto target = f::VarDescBind("out3");
+  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {});
@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
  op1->SetOutput("Out", {"out"});
  auto target = f::VarDescBind("out");
+  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"b"});
  f::OpDescBind *fill_op = block->AllOps()[forward_len];

--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -46,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) {
      return typeid(int);
    case DataType::INT64:
      return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
    default:
      PADDLE_THROW("Not support type %d", type);
  }
@@ -66,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
    case DataType::INT64:
      visitor.template operator()<int64_t>();
      break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
    default:
      PADDLE_THROW("Not supported");
  }

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -120,7 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(10) << op->DebugString();
+    VLOG(3) << op->DebugString();
    op->Run(*local_scope, *device);
  }
  if (create_local_scope) {

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -24,6 +24,7 @@
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
@@ -175,9 +176,9 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
  for (size_t ins = 0; ins < num_instances; ins++) {
    for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
-      tensor.Slice(elem, elem + 1)
+      auto slice = tensor.Slice(elem, elem + 1);
-          .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
+      CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
-                    platform::CPUDeviceContext());
+               platform::CPUDeviceContext(), &slice);
    }
  }
  return tensor;

--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -26,6 +26,8 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
+const std::string kDropOutOpType = "dropout";
+const std::string kBatchNormOpType = "batch_norm";
 bool HasDependentVar(const OpDesc& op_desc,
                     const std::set<std::string>& dependent_vars) {
@@ -106,5 +108,26 @@ void Prune(const ProgramDesc& input, ProgramDesc* output) {
  prune_impl(input, output, 0);
 }
+// Copies `input` into `*output`, then walks the ops of block `block_id` in
+// that copy. For each op whose type equals kDropOutOpType or
+// kBatchNormOpType, the first attribute named "is_test" is set to true.
+// Ops of any other type, and matching ops that carry no "is_test"
+// attribute, are left exactly as copied.
+void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
+                             int block_id) {
+  *output = input;
+  auto* block_ops = output->mutable_blocks(block_id)->mutable_ops();
+  for (auto& op : *block_ops) {
+    const bool is_dropout = op.type() == kDropOutOpType;
+    const bool is_batch_norm = op.type() == kBatchNormOpType;
+    if (!is_dropout && !is_batch_norm) {
+      continue;
+    }
+    // Only the first "is_test" attribute is flipped; scanning stops there.
+    auto* op_attrs = op.mutable_attrs();
+    for (auto& op_attr : *op_attrs) {
+      if (op_attr.name() != "is_test") {
+        continue;
+      }
+      op_attr.set_b(true);
+      break;
+    }
+  }
+}
+// Public entry point: applies inference_optimize_impl to block 0 of `input`
+// and stores the resulting program in `*output`.
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+  const int block_id = 0;
+  inference_optimize_impl(input, output, block_id);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -22,5 +22,7 @@ namespace framework {
 void Prune(const ProgramDesc& input, ProgramDesc* output);
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -89,34 +89,6 @@ class Tensor {
  /*! The internal of two tensors share the same memory block. */
  inline Tensor& ShareDataWith(const Tensor& src);
-  /**
-   * @brief   Copy the content of external tensor to a new place.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] dst_place  The dst place.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
-   */
-  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
-  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
-  // and make them global functions
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
-                       const platform::DeviceContext& ctx);
-  /**
-   * @brief   Copy the content of an external vector to a tensor.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * * @note    CopyFromVector assumes that the tensor has been resized
-   *            before invoking.
-   */
-  template <typename T>
-  inline void CopyFromVector(const std::vector<T>& src,
-                             const platform::DeviceContext& ctx);
  /**
   * @brief  Return a sub-tensor of the given tensor.
   *
@@ -141,7 +113,6 @@ class Tensor {
  size_t memory_size() const;
- private:
  inline void check_memory_size() const;
 private:

--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/framework/tensor_array.h"
-#include <glog/logging.h>
-#include <algorithm>
-#include <limits>
-#include "paddle/framework/eigen.h"
-namespace paddle {
-namespace framework {
-namespace detail {
-/*
- * Offer an iterator over the length-sorted lod-tensor's top level. The top
- * level of a lod-tensor stores batch-size of sequences, each top-level sequence
- * may contains several lower-level sequences, sort top-level lod by the numbers
- * of lower-level sequences in descending order, so that during RNN's running,
- * the batch-size will keep decreasing, the short sentences will end at the tail
- * of each batch.
- *
- * Let's take a simple lod-tensor for example
- *
- *   |(0)       |(1)        top-level has two instances
- *   |||        |||||    lower-level
- *
- * sort by lower-level's length
- *
- *   |(1)       |(0)
- *   |||||      |||
- *
- * when RNN runs, it get 5 batches (equals the number of elements the longest
- * sequence has)
- *
- * |||||
- * |||
- *
- * the first three batches has two elements, the last two elements just has 1
- * element each.
- */
-struct DynamicBatchUnpacker {
-  using value_type = float;
-  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
-                       bool descend = true)
-      : source(&source), level(level) {
-    BuildLengthSortedMeta(descend);
-  }
-  LoDTensor GetBatch(size_t index);
-  std::vector<DySeqMeta> meta;
-  LoDTensor const* source;
-  size_t level;
- protected:
-  void BuildLengthSortedMeta(bool descend);
-};
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level);
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) {
-  // collect indice need to copy to the batch
-  std::vector<size_t> indice;
-  for (const auto& seq : meta) {
-    size_t id = seq.begin + batch_id;
-    if (id >= seq.end) break;
-    indice.push_back(id);
-  }
-  return indice;
-}
-}  // namespace detail
-const LoDTensor& TensorArray::Read(size_t index) const {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-  return values_[index];
-}
-void TensorArray::Write(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-  values_[index].set_lod(value.lod());
-  values_[index].Resize(value.dims());
-  values_[index].mutable_data<value_type>(value.place());
-  values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext());
-}
-void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-  values_[index].set_lod(value.lod());
-  values_[index].ShareDataWith(value);
-}
-LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
-                            const LoD& lod) const {
-  return detail::PackDynamicBatch(values_, meta, lod, level);
-}
-DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level,
-                                   bool length_desend) {
-  detail::DynamicBatchUnpacker unpacker(source, level,
-                                        length_desend /*descend*/);
-  // find max length of all the sequences
-  size_t max_length = 0;
-  for (const auto& seq : unpacker.meta) {
-    max_length = std::max(max_length, seq.end - seq.begin);
-  }
-  // write batches to values
-  for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
-    Write(batch_id, unpacker.GetBatch(batch_id));
-  }
-  PADDLE_ENFORCE(!unpacker.meta.empty());
-  return unpacker.meta;
-}
-LoDTensor TensorArray::LodPack(size_t level) const {
-  PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists");
-  // the levels should be no less than 2
-  LoDTensor merged;
-  const LoDTensor *pre, *cur;
-  pre = &Read(0);
-  for (size_t step = 1; step < size(); step++) {
-    cur = &Read(step);
-    PADDLE_ENFORCE_GT(cur->NumLevels(), 0);
-    PADDLE_ENFORCE_GT(pre->NumLevels(), 0);
-    PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels());
-    PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level));
-    merged = LodPackTwo(*pre, *cur, level);
-    pre = &merged;
-  }
-  return merged;
-}
-/*
- * NOTE currently, only the lowest level supports packing.
- * The lowest LoD will be changed, while the relative offsets in levels above
- * stay unchanged.
- *
- * previous step : [0] [1] [3]
- * current step: [0 1 2] [2 3] []
- * packed to
- *   [0 0] [0 1] [0 2] [1 2] [1 3] [3]
- */
-LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur,
-                                  size_t level) const {
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels());
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1,
-                    "Only the lowest LoD level supports pack temporarily.");
-  // calculate the result tensor's shape first
-  size_t num_instances = 0;
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-    if (num_candidates > 0) {
-      num_instances += num_candidates * (prefix_size + 1);
-    } else {
-      num_instances += prefix_size;
-    }
-  }
-  auto res_dims = pre.dims();
-  res_dims[0] = num_instances;
-  LoDTensor result;
-  result.Resize(res_dims);
-  result.mutable_data<value_type>(cur.place());
-  Vector<size_t> last_lod_level;
-  // copy data
-  size_t index = 0;
-  last_lod_level.push_back(index);
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-    // slice the prefix Tensor
-    LoDTensor prefix = pre;
-    prefix.ShrinkInLevel(level, elem, elem + 1);
-    LoDTensor candidate = cur;
-    if (num_candidates > 0) {
-      candidate.ShrinkInLevel(level, elem, elem + 1);
-    } else {  // just push prefix
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      last_lod_level.push_back(index);
-    }
-    for (size_t candi = 0; candi < num_candidates; candi++) {
-      // TODO(superjom) support GPU
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      // copy candidate record
-      result.Slice(index, index + 1)
-          .CopyFrom(candidate.Slice(candi, candi + 1), result.place(),
-                    platform::CPUDeviceContext());
-      index++;
-      last_lod_level.push_back(index);
-    }
-  }
-  // update lod
-  auto lod = cur.lod();
-  lod.back() = last_lod_level;
-  result.set_lod(lod);
-  return result;
-}
-/*
- * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such
- * as
- * [0 3 5] [1 4 6] [2 7] with 1-level LoDs:
- * - [0 1 2 3]
- * - [0 1 2 3]
- * - [0 1 1 2], the [1,1) here means the second sequence is empty
- *
- * NOTE Unpack a LoDTensor in this approach may result in a big LoD.
- */
-void TensorArray::LodUnpack(const LoDTensor& source, size_t level) {
-  PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1,
-                    "only the lowest LoD level supports unpack.");
-  const size_t non_empty_instances = source.dims()[0];
-  size_t index = 0;
-  Vector<size_t> lowest_lod_level;
-  lowest_lod_level.push_back(index);
-  for (size_t step = 0; step < non_empty_instances; step++) {
-    size_t num_instances = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        num_instances++;
-        index++;
-      }
-      lowest_lod_level.push_back(index);
-    }
-    // create tensor for this time step
-    LoDTensor tensor;
-    auto dims = source.dims();
-    dims[0] = num_instances;
-    // set lod
-    auto lod = source.lod();
-    lod.back() = lowest_lod_level;
-    tensor.set_lod(lod);
-    index = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        // copy this instance
-        tensor.Slice(index, index + 1)
-            .CopyFrom(instance.Slice(step, step + 1), tensor.place(),
-                      platform::CPUDeviceContext());
-        index++;
-      }
-    }
-    Write(step, tensor);
-  }
-}
-LoDTensor TensorArray::Stack() const {
-  LoDTensor result;
-  if (size() == 0) return result;
-  const auto& first_dims = values_.front().dims();
-  // check all the values have the same shape
-  // TODO(superjom) check the same dtypes
-  for (size_t idx = 1; idx < size(); idx++) {
-    const auto& value_dims = values_[idx].dims();
-    PADDLE_ENFORCE_EQ(first_dims, value_dims);
-  }
-  // copy
-  auto result_dims = vectorize(first_dims);
-  result_dims.insert(result_dims.begin(), size());
-  result.Resize(make_ddim(result_dims));
-  result.mutable_data<value_type>(platform::CPUPlace());
-  for (size_t idx = 0; idx < size(); idx++) {
-    result.Slice(idx, idx + 1)
-        .CopyFrom(Read(idx), platform::CPUPlace(),
-                  platform::CPUDeviceContext());
-  }
-  return result;
-}
-void TensorArray::Unstack(const LoDTensor& source) const {
-  Unstack(source, false /*data_shared*/);
-}
-void TensorArray::UnstackShared(const LoDTensor& source) const {
-  Unstack(source, true /*data_shared*/);
-}
-void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
-  size_t first_dim = source.dims()[0];
-  DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
-  PADDLE_ENFORCE_GT(first_dim, 0,
-                    "source should have some data to be unstacked");
-  values_.resize(first_dim);
-  for (size_t elem = 0; elem < first_dim; elem++) {
-    // create a new value
-    auto& value = values_[elem];
-    if (data_shared) {
-      // share memory
-      value.ShareDataWith(source.Slice(elem, elem + 1));
-    } else {
-      // copy
-      value.Resize(value_dims);
-      value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
-                     platform::CPUDeviceContext());
-    }
-  }
-}
-size_t TensorArray::size() const { return values_.size(); }
-namespace detail {
-void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
-  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
-  // collect meta for each sequence in some level
-  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
-  for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) {
-    DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
-    meta.push_back(seq_meta);
-  }
-  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
-  // sort by length
-  sort(meta.begin(), meta.end(),
-       [descend](const DySeqMeta& a, const DySeqMeta& b) {
-         bool a_ge_b = (a.end - a.begin) > (b.end - b.begin);
-         return descend ? a_ge_b : !a_ge_b;
-       });
-}
-LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
-  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
-  LoDTensor result;
-  auto indice = detail::GenDyBatchIndice(meta, index);
-  PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
-  // copy the indice of records in LoDTensor
-  auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  record_dims_vec.insert(record_dims_vec.begin(), indice.size());
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<value_type>(platform::CPUPlace());
-  for (size_t i = 0; i < indice.size(); i++) {
-    auto index = indice[i];
-    auto target = result.Slice(i, i + 1);
-    auto slice = source->Slice(index, index + 1);
-    target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
-  }
-  return result;
-}
-// TODO(supejom) to cache lod if reasonable
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level) {
-  PADDLE_ENFORCE(!source.empty());
-  PADDLE_ENFORCE(!meta.empty());
-  PADDLE_ENFORCE(!lod.empty());
-  LoDTensor result;
-  // init result space
-  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  auto height = lod[level].back();
-  record_dims_vec.insert(record_dims_vec.begin(), height);
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<float>(platform::CPUPlace());
-  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
-    for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
-      const auto& seq_meta = meta[seq_id];
-      // source is source[batch_id][seq_id]
-      // target is result[index]
-      auto index = seq_meta.begin + batch_id;
-      if (index >= seq_meta.end) break;
-      auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
-      auto target = result.Slice(index, index + 1);
-      target.CopyFrom(source_, platform::CPUPlace(),
-                      platform::CPUDeviceContext());
-    }
-  }
-  result.set_lod(lod);
-  return result;
-}
-}  // namespace detail
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#pragma once
-#include <vector>
-#include "paddle/framework/lod_tensor.h"
-namespace paddle {
-namespace framework {
-/*
- * DyBatchSeqPosition stores indices of the basic element in tensor. It is used
- * after lod-tensor's re-assembling, its info can be used to recover the order
- * in original lod-tensor.
- */
-struct DySeqMeta {
-  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
-      : begin(begin), end(end), ori_idx(ori_idx) {}
-  size_t begin;
-  size_t end;  // not included
-  size_t ori_idx;
-};
-using DySeqMetaBatch = std::vector<DySeqMeta>;
-/*
- * Extract the indices of instances.
- */
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id);
-/*
- * TensorArray is a C-array-like array of tensors, it is meant to be used with
- * dynamic iteration primitives such as while_loop. It is used to segment inputs
- * and store states in all time steps.
- *
- * By providing some methods similar to a C++ array, the difinition of some
- * state-based dynamic models such as RNN cound be more natural and highly
- * flexible.
- */
-class TensorArray {
- public:
-  using value_type = float;
-  // max number of values allowed to store.
-  const size_t MAX_SIZE{100000};
-  /*
-   * Read the value at location `index` in the `TensorArray`.
-   */
-  const LoDTensor &Read(size_t index) const;
-  /*
-   * Write value into the index of the TensorArray.
-   */
-  void Write(size_t index, const LoDTensor &value);
-  /*
-   * Write value into the index of the TensorArray, with memory shared.
-   */
-  void WriteShared(size_t index, const LoDTensor &value);
-  /*
-   * Recover the original LoD-arranged LoDTensor with the `values`, `level` and
-   * `indice_map`.
-   */
-  LoDTensor Pack(size_t level, const DySeqMetaBatch &meta,
-                 const LoD &lod) const;
-  /*
-   * Split LoDTensor in some `level` and write the generated batches to
-   * `values`, if set `desend`, will sort by length in descending order else in
-   * ascending order.
-   */
-  DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend);
-  /*
-   * Pack an array of LoDTensors to a LoDTensor.
-   */
-  LoDTensor LodPack(size_t level) const;
-  /*
-   * Unpack a LoDTensor to an array of LoDTensors.
-   */
-  void LodUnpack(const LoDTensor &source, size_t level);
-  /*
-   * Pack the values into a tensor with rank one higher than each tensor in
-   * values.
-   */
-  LoDTensor Stack() const;
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
-   */
-  void Unstack(const LoDTensor &source) const;
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
-   * with memory of tensors shared.
-   */
-  void UnstackShared(const LoDTensor &source) const;
-  /*
-   * Return the number of values.
-   */
-  size_t size() const;
- protected:
-  void Unstack(const LoDTensor &source, bool data_shared) const;
-  LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur,
-                       size_t level) const;
- private:
-  mutable std::vector<LoDTensor> values_;
-};  // class TensorArray
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/tensor_array_test.cc
+++ b/paddle/framework/tensor_array_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/framework/tensor_array.h"
-#include <gtest/gtest.h>
-namespace paddle {
-namespace framework {
-class TensorArrayTester : public ::testing::Test {
- protected:
-  void SetUp() override {
-    LoDTensor source;
-    source.Resize(make_ddim({batch_size, dim}));
-    int* data = source.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < 16 * 32; i++) {
-      data[i] = i;
-    }
-    ta.Unstack(source);
-  }
-  TensorArray ta;
-  const int batch_size = 16;
-  const int dim = 32;
-};
-TEST_F(TensorArrayTester, Read) {
-  for (int i = 0; i < batch_size; i++) {
-    const auto& tensor = ta.Read(i);
-    ASSERT_EQ(tensor.dims()[0], 1);
-    ASSERT_EQ(tensor.dims()[1], dim);
-  }
-}
-TEST_F(TensorArrayTester, Write) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-  ta.Write(2, source);
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-}
-TEST_F(TensorArrayTester, WriteShared) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-  ta.WriteShared(2, source);
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-  EXPECT_EQ(source.data<int>(), tensor.data<int>());
-}
-class TensorArrayPackTester : public ::testing::Test {
- protected:
-  virtual void SetUp() override {
-    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
-    source.set_lod(lod);
-    source.Resize(make_ddim({13, 128}));
-    source.mutable_data<int>(platform::CPUPlace());
-    // content of each setence: 0 1 2 3 4
-    const auto& level = lod.front();
-    for (size_t i = 0; i < level.size() - 1; i++) {
-      size_t begin = level[i];
-      size_t end = level[i + 1];
-      for (size_t j = begin; j < end; j++) {
-        auto record = source.Slice(j, j + 1);
-        for (int dim = 0; dim < 128; dim++) {
-          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
-        }
-      }
-    }
-    // unpack
-    meta = ta.Unpack(source, 0, true);
-  }
-  LoD lod;
-  TensorArray ta;
-  LoDTensor source;
-  std::vector<DySeqMeta> meta;
-};
-TEST_F(TensorArrayPackTester, Unpack) {
-  ASSERT_EQ(ta.size(), 7UL);
-  const auto& t0 = ta.Read(0);
-  const auto& t1 = ta.Read(1);
-  ASSERT_EQ(t0.data<int>()[0], int(0));
-  ASSERT_EQ(t1.data<int>()[0], int(1));
-}
-TEST_F(TensorArrayPackTester, Pack) {
-  LoDTensor packed = ta.Pack(0, meta, lod);
-}
-TEST_F(TensorArrayTester, size) {
-  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
-}
-TEST(TensorArray, LodPack) {
-  // three time steps, each step stores a LoDTensors
-  // - [0] [1]
-  // - [2 3], [4 5]
-  // - [6 7] [] [8], [9, 10]
-  // try to get a LoDTensor with content:
-  // - [0 2 6]
-  // - [0 2 7]
-  // - [0 3]
-  // - [1 4 8]
-  // - [1 5 9]
-  // - [1 5 10]
-  std::array<LoDTensor, 3> tensors;
-  tensors[0].Resize(make_ddim({2, 1}));
-  tensors[1].Resize(make_ddim({4, 1}));
-  tensors[2].Resize(make_ddim({5, 1}));
-  int index = 0;
-  for (auto& t : tensors) {
-    t.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < t.dims()[0]; i++) {
-      t.data<int>()[i] = index;
-      index++;
-    }
-  }
-  std::array<LoD, 3> lods;
-  std::vector<std::vector<size_t>> levels{
-      {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}};
-  for (int i = 0; i < 3; i++) {
-    lods[i].emplace_back(levels[i].begin(), levels[i].end());
-  }
-  TensorArray ta;
-  for (int i = 0; i < 3; i++) {
-    tensors[i].set_lod(lods[i]);
-    ta.Write(i, tensors[i]);
-  }
-  auto merged = ta.LodPack(0);
-  std::vector<int> target_tensor_data{{0, 2, 6,  // 0
-                                       0, 2, 7,  // 1
-                                       0, 3,     // 2
-                                       1, 4, 8,  // 3
-                                       1, 5, 9,  // 5
-                                       1, 5, 10}};
-  EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size());
-  for (size_t i = 0; i < target_tensor_data.size(); i++) {
-    EXPECT_EQ(target_tensor_data[i], merged.data<int>()[i]);
-  }
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -150,84 +150,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
  return *this;
 }
-inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place,
-                             const platform::DeviceContext& ctx) {
-  src.check_memory_size();
-  Resize(src.dims());
-  auto src_place = src.holder_->place();
-  auto src_ptr = src.data<void>();
-  auto dst_ptr = mutable_data(dst_place, src.type());
-  auto size = src.numel() * SizeOfType(src.type());
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-template <typename T>
-inline void Tensor::CopyFromVector(const std::vector<T>& src,
-                                   const platform::DeviceContext& ctx) {
-  auto dst_place = ctx.GetPlace();
-  auto src_ptr = static_cast<const void*>(src.data());
-  platform::CPUPlace src_place;
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-  auto size = src.size() * sizeof(T);
-  if (platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
-                 src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(
-        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
-        size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
  check_memory_size();
  PADDLE_ENFORCE_GE(begin_idx, 0,

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -188,178 +188,6 @@ TEST(Tensor, Slice) {
 #endif
 }
-TEST(Tensor, CopyFrom) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    CPUDeviceContext cpu_ctx((CPUPlace()));
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-    dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    Tensor src_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-    // CPU Tensor to GPU Tensor
-    auto gpu_place = new paddle::platform::GPUPlace(0);
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
-    // GPU Tensor to CPU Tensor
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-    // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
-    // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-    // Sync before Compare Slice Tensors
-    gpu_ctx.Wait();
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#endif
-}
-TEST(Tensor, CopyFromVector) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    // Compare Tensors
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    cpu_ptr = cpu_tensor.data<int>();
-    src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-    delete cpu_place;
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    // Copy to GPUTensor
-    gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* src_ptr = src_vec.data();
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    gpu_tensor.Resize(make_ddim({2, 2}));
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    src_ptr = src_vec.data();
-    cpu_ptr = cpu_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-    delete cpu_place;
-    delete gpu_place;
-  }
-#endif
-}
 TEST(Tensor, ReshapeToMatrix) {
  using namespace paddle::framework;
  using namespace paddle::platform;

--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/tensor.h"
+namespace paddle {
+namespace framework {
+/**
+ * @brief   Copy the content of an external tensor into dst, allocating dst's
+ *          memory on dst_place.
+ *
+ * @param[in]  src        The external (source) tensor.
+ * @param[in]  dst_place  The dst place.
+ * @param[in]  ctx        The device context contains device resources. For
+ *                        every copy that involves a GPU it must be a GPU
+ *                        context (enforced below) and its stream is used.
+ * @param[out] dst        The destination tensor. It is resized to src's dims
+ *                        and takes src's element type.
+ *
+ * @note    CopyFrom supports CPU <-> CPU, CPU <-> GPU, GPU <-> GPU. The GPU
+ *          branches are only compiled when PADDLE_WITH_CUDA is defined.
+ * @note    NOTE(review): GPU copies are issued on ctx's stream and the tests
+ *          call ctx.Wait() before reading the result, so callers presumably
+ *          have to synchronize as well -- confirm.
+ */
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     const platform::DeviceContext& ctx, Tensor* dst) {
+  src.check_memory_size();
+  dst->Resize(src.dims());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+  // Number of bytes to copy.
+  auto size = src.numel() * SizeOfType(src.type());
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    // GPU -> CPU: ctx must be the context of the source GPU.
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    // CPU -> GPU: ctx must be the context of the destination GPU.
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    // GPU -> GPU: only the source GPU is checked against ctx.
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+/**
+ * @brief   Copy the content of an external vector to a tensor.
+ *
+ * @param[in]  src        The external vector (host memory).
+ * @param[in]  ctx        The device context contains device resources. Its
+ *                        place decides where dst's memory is allocated.
+ * @param[out] dst        The destination tensor.
+ *
+ * @note    dst is resized to a 1-D tensor holding src.size() elements before
+ *          the copy, so any shape the caller set beforehand is overwritten.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  // Number of bytes to copy.
+  auto size = src.size() * sizeof(T);
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    // Host -> GPU copy issued on ctx's stream.
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+/**
+ * @brief   Copy the content of a tensor to a vector
+ *
+ * @param[in]  src        The source tensor; its data is read as type T.
+ * @param[in]  ctx        The device context contains device resources. It is
+ *                        only used (for its stream) when src is on a GPU.
+ * @param[out] dst        The destination vector; it is resized to
+ *                        src.numel() elements before the copy.
+ *
+ * @note    NOTE(review): the GPU branch issues the copy on ctx's stream; the
+ *          other GPU tests call ctx.Wait() before reading results, so the
+ *          caller presumably has to synchronize before using dst -- confirm.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  // Number of bytes to copy.
+  auto size = src.numel() * sizeof(T);
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    // GPU -> host copy issued on ctx's stream.
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#include "paddle/framework/tensor_util.h"
+#include <gtest/gtest.h>
+#include <string>
+namespace paddle {
+namespace framework {
+// Verifies CopyFrom for CPU -> CPU and, when built with CUDA, for the
+// CPU -> GPU -> CPU round trip, both for a full tensor and a one-row slice.
+TEST(CopyFrom, Tensor) {
+  Tensor src_tensor;
+  Tensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+  int* src_ptr =
+      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  // Places live on the stack so nothing leaks, even if an ASSERT_* below
+  // returns from the test early.
+  platform::CPUPlace cpu_place;
+  CopyFrom(src_tensor, cpu_place, cpu_ctx, &dst_tensor);
+  const int* dst_ptr = dst_tensor.data<int>();
+  // The copy must own its own buffer.
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+  // Copy a single row (row 1) of the 3x3 source.
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  CopyFrom(slice_tensor, cpu_place, cpu_ctx, &dst_tensor);
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+    int* src_ptr =
+        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+    // CPU Tensor to GPU Tensor
+    platform::GPUPlace gpu_place(0);
+    platform::CUDADeviceContext gpu_ctx(gpu_place);
+    CopyFrom(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
+    // GPU Tensor to CPU Tensor
+    platform::CPUPlace cpu_place;
+    CopyFrom(gpu_tensor, cpu_place, gpu_ctx, &dst_tensor);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+    // CPU Slice Tensor to GPU Tensor
+    CopyFrom(slice_tensor, gpu_place, gpu_ctx, &gpu_tensor);
+    // GPU Tensor to CPU Tensor
+    CopyFrom(gpu_tensor, cpu_place, gpu_ctx, &dst_tensor);
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#endif
+}
+// Verifies CopyFromVector for a CPU destination and, when built with CUDA,
+// for a GPU destination (read back through CopyFrom).
+TEST(CopyFromVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+    // Drop the first five elements; four remain.
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    // Compare only the elements that exist; a fixed bound of 5 would read
+    // one element past the end of both buffers.
+    for (size_t i = 0; i < src_vec.size(); ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+    delete cpu_place;
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    // Copy from GPU to CPU tensor for comparison
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+    // Drop the first five elements; four remain.
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    // Compare only the elements that exist; a fixed bound of 5 would read
+    // one element past the end of the buffers.
+    for (size_t i = 0; i < src_vec.size(); ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+// Verifies CopyToVector for a CPU source tensor and, when built with CUDA,
+// for a GPU source tensor.
+TEST(CopyToVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    for (int i = 0; i < 3 * 3; ++i) {
+      src_ptr[i] = i;
+    }
+    CPUPlace place;
+    CPUDeviceContext cpu_ctx(place);
+    std::vector<int> dst;
+    CopyToVector<int>(src, cpu_ctx, &dst);
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_ptr[i], dst[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor gpu_tensor;
+    GPUPlace place;
+    CUDADeviceContext gpu_ctx(place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    std::vector<int> dst;
+    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+    // Sync before Compare: both copies are issued on gpu_ctx's stream, so
+    // wait for them to finish before reading dst on the host (same pattern
+    // as the other GPU tests in this file).
+    gpu_ctx.Wait();
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
+#endif
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
+/**
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ *
+ * The derivative is \f$ f'(z) = \frac{1}{(1 + |z|)^2} \f$; backward computes
+ * it from the denominator cached by forward.
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+// Cache shared between forward and backward: forward fills it with the
+// denominator 1 + |z|, backward overwrites it in place.
+MatrixPtr denominator_;
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  // denominator_ = 1 + |z|
+  // NOTE(review): presumably abs2() and add() operate element-wise in place
+  // -- confirm against Matrix.
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+  // act.value = z / (1 + |z|)
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+Error __must_check backward(Argument& act) {
+  // denominator_ = 1 / (1 + |z|)^2
+  // NOTE(review): presumably scalarDiv(b, p) computes p / b element-wise --
+  // confirm. denominator_ is consumed here, so backward relies on a
+  // preceding forward call and should not run twice for one forward.
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  // Chain rule: scale the incoming gradient by the derivative.
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
 /**
 * @brief Relu Activation.
 * forward. y = max(0, z)

--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
  weight_.reset(new Weight(1, channels_, parameters_[0]));
  movingMean_.reset(new Weight(1, channels_, parameters_[1]));

--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ protected:
  bool useGlobalStats_;
  // use to compute moving mean and variance.
  real movingAvgFraction_;
+  // Epsilon is a small constant added to the variance in batch normalization
+  // for numerical stability.
+  real epsilon_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
-const real BatchNormalizationLayer::EPS = 1E-5;
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                   const ParameterMap& parameterMap) {
  /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
  calMovingMeanAndVar();
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
  savedInvVar_->sqrt2(*savedInvVar_);
 }
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
  savedInvVar_->copyFrom(*(movingVar_->getW()));
  savedInvVar_->downClip(real(0.0));
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
  savedInvVar_->sqrt2(*savedInvVar_);
 }

--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override;
 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
  /// Load pre-calculated mean and std.
  void setMeanAndStd();

--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
-const double CudnnBatchNormLayer::EPS = 1E-5;
 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                               const ParameterMap& parameterMap) {
  /* Initialize the basic parent class */
@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
  real* movingMean = movingMean_->getW()->getData();
  real* movingVar = movingVar_->getW()->getData();
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
  if (!useGlobalStats_) {
    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
    real* savedMean = savedMean_->getData();
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   1.0 - movingAvgFraction_,
                                   movingMean,
                                   movingVar,
-                                   EPS,
+                                   eps_,
                                   savedMean,
                                   savedInvVar);
  } else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                      beta,
                                      movingMean,
                                      movingVar,
-                                      EPS);
+                                      eps_);
    } else {
      // There is a limitation in cudnn library.
      // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   beta,
                                   movingMean,
                                   movingVar,
-                                   EPS,
+                                   eps_,
                                   batchSize,
                                   channels_,
                                   imageH_ * imageD_,
@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
  real* savedMean = savedMean_->getData();
  real* savedInvVar = savedInvVar_->getData();
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
    Matrix::resizeOrCreate(m, h, w, false, true);
    m->zeroMem();
@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                         gamma,
                         gammaGrad,
                         betaGrad,
-                         EPS,
+                         eps_,
                         savedMean,
                         savedInvVar);

--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <cudnn.h>
 #include "BatchNormBaseLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override;
 protected:
-  /**
+  /// Epsilon value used in the batch normalization formula.
-   * Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
+  double eps_;
-   * Same epsilon value should be used in forward and backward functions.
-   */
-  static const double EPS;
  /// Input/output tensor descriptor desc
  hl_tensor_descriptor ioDesc_;

--- a/paddle/gserver/layers/DotProdLayer.cpp
+++ b/paddle/gserver/layers/DotProdLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ *
+ * Both inputs must have the same height and width; this is checked in
+ * forward().
+ */
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+  ~DotProdLayer() {}
+  /// Checks that the layer has exactly two inputs and an output size of 1.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  /// Computes the per-row dot product of the two inputs into the output.
+  void forward(PassType passType) override;
+  /// Accumulates the gradient into each input gradient that is present.
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+// NOTE(review): presumably makes the layer available under the config type
+// name dot_prod -- confirm against REGISTER_LAYER.
+REGISTER_LAYER(dot_prod, DotProdLayer);
+// Initializes the base layer and validates the configuration: exactly two
+// input layers and an output size of 1. Always returns true; a violated
+// check is reported through CHECK_EQ.
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  // NOTE(review): the return value of Layer::init is not inspected --
+  // confirm that this is intended.
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+  return true;
+}
+// Forward pass: checks that both inputs have the same height and width,
+// reserves a (batchSize x 1) output and fills it from the two inputs.
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  // The height of the first input is used as the batch size.
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    // NOTE(review): presumably computes, per row,
+    // out = 1 * sum(inV0 .* inV1) + 0 * out -- confirm against
+    // Matrix::sumOfProducts.
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+// Backward pass: for each input whose gradient matrix exists, accumulates
+// the other input's value, row-scaled by the output gradient.
+// NOTE(review): the exact semantics of addRowScale are assumed from its
+// name -- confirm against Matrix.
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  const MatrixPtr lhsValue = getInputValue(0);
+  const MatrixPtr rhsValue = getInputValue(1);
+  const MatrixPtr outputGrad = getOutputGrad();
+  const MatrixPtr lhsGrad = getInputGrad(0);
+  const MatrixPtr rhsGrad = getInputGrad(1);
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+    // The gradient w.r.t. one input uses the value of the other input.
+    if (lhsGrad) lhsGrad->addRowScale(0, *rhsValue, *outputGrad);
+    if (rhsGrad) rhsGrad->addRowScale(0, *lhsValue, *outputGrad);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/L2DistanceLayer.cpp
+++ b/paddle/gserver/layers/L2DistanceLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "L2DistanceLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+// NOTE(review): presumably makes the layer available under the config type
+// name l2_distance -- confirm against REGISTER_LAYER.
+REGISTER_LAYER(l2_distance, L2DistanceLayer);
+// Initializes the base layer and validates the configuration: exactly two
+// input layers and an output size of 1. Always returns true; a violated
+// check is reported through CHECK_EQ.
+bool L2DistanceLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  // NOTE(review): the return value of Layer::init is not inspected --
+  // confirm that this is intended.
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
+                                     << "only two inputs.";
+  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
+                           << "is fixed to be 1.";
+  return true;
+}
+// Forward pass: out = sqrt(sum_i (x_i - y_i)^2) for every row of the two
+// inputs. The difference x - y is kept in inputSub_ for reuse in backward().
+void L2DistanceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const auto inV1 = getInputValue(0);
+  const auto inV2 = getInputValue(1);
+  CHECK(inV1 && inV2);
+  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
+      << "The height of two inputs of this layer must be the same.";
+  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
+      << "The width of two inputs of this layer must be the same.";
+  int batchSize = inV1->getHeight();
+  int output_dim = getSize();
+  {
+    // Forward-pass timer. It has its own name so that forward time is not
+    // booked under the backward timer ("L2DistanceBpAtvTimer").
+    REGISTER_TIMER_INFO("L2DistanceFwAtvTimer", getName().c_str());
+    reserveOutput(batchSize, output_dim);
+    auto outV = getOutputValue();
+    CHECK(outV) << "The output matrix should not be null.";
+    Matrix::resizeOrCreate(
+        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
+    // inputSub_ = x - y
+    inputSub_->assign(*inV1);
+    inputSub_->sub(*inV2);
+    // outV = sqrt(row-wise sum of inputSub_ .* inputSub_)
+    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
+    outV->sqrt2(*outV);
+  }
+}
+// Backward pass: accumulates outG * (x - y) / out into the gradient of the
+// first input and the negated term into the gradient of the second input,
+// reusing inputSub_ (x - y) and the output value from forward().
+void L2DistanceLayer::backward(const UpdateCallback& callback) {
+  const auto outG = getOutputGrad();
+  const auto outV = getOutputValue();
+  CHECK(outG && outV);
+  auto inGrad1 = getInputGrad(0);
+  auto inGrad2 = getInputGrad(1);
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+    if (inGrad1 || inGrad2) {
+      // outV = outG / outV, computed in place.
+      // NOTE(review): this overwrites the layer's output value, and a row
+      // whose distance is 0 presumably divides by zero -- confirm that both
+      // are acceptable (scalarDiv(b, p) is assumed to compute p / b).
+      outV->scalarDiv(*outV, 1.);
+      outV->dotMul(*outG, *outV);
+    }
+    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
+    if (inGrad2) {
+      // The gradient w.r.t. the second input has the opposite sign;
+      // inputSub_ is negated in place, so it no longer holds x - y afterwards.
+      inputSub_->mulScalar(-1.);
+      inGrad2->addRowScale(0, *inputSub_, *outV);
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/L2DistanceLayer.h
+++ b/paddle/gserver/layers/L2DistanceLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief The layer calculates the l2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+class L2DistanceLayer : public Layer {
+public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+  /// Checks that the layer has exactly two inputs and an output size of 1.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  /// Computes the per-row l2 distance of the two inputs.
+  void forward(PassType passType) override;
+  /// Accumulates the gradient into each input gradient that is present.
+  void backward(const UpdateCallback& callback = nullptr) override;
+private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
 }
 void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
  reshapeInput(bs, ih, iw);
  ic = inputLayers_[0]->getSize() / ih / iw;
  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
 }
 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, bias, out);
+  resetFwdBuffers(inputs, biasVal_, out);
-  in = inVals_[0];
  std::shared_ptr<sum::primitive_desc> fwdPD;
  std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
-  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
 }
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, bias, out);
+  resetBwdBuffers(inputs, biasGrad_, out);
-  in = inGrads_[0];
  // backward only need share output grad to input grad
-  for (size_t i = 0; i < inGrads_.size(); i++) {
+  for (size_t i = 0; i < inputs.size(); i++) {
-    if (inGrads_[i] != nullptr) {
+    if (inputs[i] != nullptr) {
-      inGrads_[i] = out;
+      inputs[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
    }
  }
  // backward bias
  bwdBias_ = nullptr;
-  if (bias) {
+  if (biasGrad_) {
    std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    std::vector<memory::primitive_desc> srcPDs(bs_,
-    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
    std::vector<primitive::at> srcs;
    for (size_t i = 0; i < grads_.size(); ++i) {
      srcs.push_back(*(grads_[i]));
    }
-    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
    pipeline.push_back(*bwdBias_);
  }
 }
@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }

--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -26,9 +26,6 @@ namespace paddle {
 */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;
@@ -50,52 +47,19 @@ public:
            const ParameterMap& parameterMap) override;
  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
+                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;
  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
+                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;
  void updateWeights(const UpdateCallback& callback) override;
-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
-    }
-  }
 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
@@ -110,17 +74,10 @@ protected:
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
-  /**
-   * prepare for bias
-   */
  void prepareBias(MKLDNNMatrixPtr& bias,
                   const MatrixPtr& biasMat,
                   const MKLDNNMatrixPtr& out,

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
-const real MKLDNNBatchNormLayer::EPS = 1E-5;
 bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                    << " --- global stats";
  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }
 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  oh = ih;
  ow = iw;
  // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
      << "Input channel can not be changed";
  reshapeOutput(oh, ow);
  resizeOutput(bs, oc * oh * ow);
 }
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
                                    MKLDNNMatrixPtr& out) {
  // In training phase, it will always calculate mean and var,
  // so useGlobalStats must be false.
@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
    useGlobalStats_ = false;
  }
-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);
-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }
 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
                                    MKLDNNMatrixPtr& out) {
  std::shared_ptr<bn_bwd::primitive_desc> pd;
-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);
-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);
-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }
 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -213,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
  if (wgt) {
    flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
  }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
  if (wgt) {
@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                           MKLDNNMatrixPtr& wgt,
                                           MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
  if (gradScaleShift_) {
    CHECK(wgtVal_);
    resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -280,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
  }
  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
  auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
  if (pd == nullptr) {
    return;
  }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
  bwdData_.reset(
      wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          ? new bn_bwd(
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
  pipeline.push_back(*bwdData_);
 }

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
--- a/paddle/gserver/layers/MKLDNNConcatLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
--- a/paddle/operators/beam_search_decode_op.h
+++ b/paddle/operators/beam_search_decode_op.h
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
--- a/paddle/operators/dynamic_recurrent_op.h
+++ b/paddle/operators/dynamic_recurrent_op.h
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
--- a/paddle/operators/ftrl_op.cu
+++ b/paddle/operators/ftrl_op.cu
--- a/paddle/operators/ftrl_op.h
+++ b/paddle/operators/ftrl_op.h
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
--- a/paddle/operators/logical_op.cu
+++ b/paddle/operators/logical_op.cu
--- a/paddle/operators/logical_op.h
+++ b/paddle/operators/logical_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/operators/pool_op.cu.cc
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/operators/sequence_slice_op.cu
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/tensor.save
+++ b/paddle/operators/tensor.save
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
--- a/paddle/platform/cudnn_helper_test.cc
+++ b/paddle/platform/cudnn_helper_test.cc
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
--- a/paddle/platform/enforce.cc
+++ b/paddle/platform/enforce.cc
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/trainer/tests/chunking.conf
+++ b/paddle/trainer/tests/chunking.conf
--- a/paddle/trainer/tests/compare_sparse_data
+++ b/paddle/trainer/tests/compare_sparse_data
--- a/paddle/trainer/tests/data_bin_part
+++ b/paddle/trainer/tests/data_bin_part
--- a/paddle/trainer/tests/gen_proto_data.py
+++ b/paddle/trainer/tests/gen_proto_data.py
--- a/paddle/trainer/tests/test.txt
+++ b/paddle/trainer/tests/test.txt
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
--- a/paddle/trainer/tests/test_files.txt
+++ b/paddle/trainer/tests/test_files.txt
--- a/paddle/trainer/tests/train.list
+++ b/paddle/trainer/tests/train.list
--- a/paddle/trainer/tests/train.txt
+++ b/paddle/trainer/tests/train.txt
--- a/paddle/trainer/tests/train_files.txt
+++ b/paddle/trainer/tests/train_files.txt
--- a/paddle/trainer/tests/train_sparse.list
+++ b/paddle/trainer/tests/train_sparse.list
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
--- a/python/paddle/v2/fluid/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
--- a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
--- a/python/paddle/v2/fluid/tests/test_ftrl_op.py
+++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py
--- a/python/paddle/v2/fluid/tests/test_gru_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
--- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
--- a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
--- a/python/paddle/v2/fluid/tests/test_logical_op.py
+++ b/python/paddle/v2/fluid/tests/test_logical_op.py
--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
--- a/python/paddle/v2/fluid/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
--- a/python/paddle/v2/fluid/tests/test_pool2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py
--- a/python/paddle/v2/fluid/tests/test_pool3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
--- a/python/paddle/v2/fluid/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
--- a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/v2/fluid/tests/test_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_tensor_array.py
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py