Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into adadelta

de2d7299 · wanghaoshuang · 72847ad0 · 484cff6e · de2d7299 · de2d7299
419 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -56,7 +56,7 @@ script:
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2   
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
  email:
    on_success: change

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
-option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         ON)
+option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)

--- a/Dockerfile
+++ b/Dockerfile
@@ -53,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
+# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
+# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
+# version(1.7.1 for now), which causes building documentation failed.
 RUN pip install --upgrade pip && \
    pip install -U wheel && \
-    pip install -U docopt PyYAML sphinx && \
+    pip install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+    pip install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -78,6 +78,7 @@ IF(NOT ${CBLAS_FOUND})
        BUILD_IN_SOURCE     1
        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR> 
+                            && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
                \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
            )"
        )
-        INSTALL(CODE "execute_process(
-            COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
-                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
-            )"
-        )
    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
    else()
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+      find_fluid_modules(${TARGET_NAME})
    endif()
    if(cc_library_DEPS)
      # Don't need link libwarpctc.so
      if("${cc_library_DEPS};" MATCHES "warpctc;")
@@ -242,11 +244,11 @@ function(cc_test TARGET_NAME)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
    endif()
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -264,6 +266,7 @@ function(nv_library TARGET_NAME)
        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
      else()
        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+        find_fluid_modules(${TARGET_NAME})
      endif()
      if (nv_library_DEPS)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -308,8 +311,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(nv_test)

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
 # make package for paddle fluid shared and static library
 function(copy TARGET)
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -42,13 +55,21 @@ copy(glog_lib
  DSTS ${dst_dir} ${dst_dir}/lib
 )
-IF(NOT PROTOBUF_FOUND)
+if(NOT PROTOBUF_FOUND)
    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
    copy(protobuf_lib
-      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
      DSTS ${dst_dir} ${dst_dir}/lib
    )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
+if(NOT CBLAS_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    copy(openblas_lib
+      SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+      DSTS ${dst_dir} ${dst_dir}
+    )
+endif()
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -66,8 +87,8 @@ copy(memory_lib
 )
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
@@ -83,6 +104,4 @@ copy(string_lib
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
-add_custom_target(inference_lib_dist DEPENDS 
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
-  inference_lib framework_lib memory_lib platform_lib string_lib
-  gflags_lib glog_lib protobuf_lib eigen3_lib)
--- a/doc/design/cpp_data_feeding.md
+++ b/doc/design/cpp_data_feeding.md
-# C++ Data Feeding
-In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
-In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
-## Reader
-A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
-### `ReaderBase`
-`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
-```cpp
-class ReaderBase {
- public:
-  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
-    PADDLE_ENFORCE(!shapes_.empty());
-  }
-  // Read the next batch of data. (A 'batch' can be only one instance)
-  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
-  // Show whether the next bacth exists.
-  virtual bool HasNext() const = 0;
-  // Reinitialize the reader and read the file from the begin.
-  virtual void ReInit() = 0;
-  // Get a certain read in data's shape.
-  DDim shape(size_t idx) const;
-  // Get shapes of all read in data.
-  std::vector<DDim> shapes() const { return shapes_; }
-  // Set shapes of read in data.
-  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
-  virtual ~ReaderBase() {}
- protected:
-  std::vector<DDim> shapes_;
-};
-```
-### `FileReader` and `DecoratedReader`
-These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
-All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
-### `ReaderHolder`
-Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
-```cpp
-var->Get<ReaderBase>("batch_reader");
-```
-we have to write:
-```cpp
-var->Get<BatchReader>("batch_reader");
-```
-This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
-To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
-## Related Operators
-To create and invoke readers, some now ops are introduced:
-### `CreateReaderOp`
-Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
-### `ReadOp`
-A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
--- a/doc/design/distributed_lookup_table_design.md
+++ b/doc/design/distributed_lookup_table_design.md
+## Design Doc: Distributed Lookup Table Operator
+A lookup table operator in PaddlePaddle where the table could be out
+of the memory of a computer.
+## Background
+A lookup table operator is well-used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+### The Forward Algorithm
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+$$y = x * W$$
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x).  Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say, two symbols, and a lookup table W:
+![lookup table](./lookup_table.png)
+### The Backward Algorithm
+The backward algorithm computes W'(x) using W(x).  W'(x) has the same
+scale of size as W(x) and is much smaller than W.
+To optimize W given W', we can do simple SGD update:
+$$W = f(W') = \lambda * W'$$
+or some more sophisticated algorithms that rely on both W' and W:
+$$W = f(W, W')$$
+The following figure illustrates the backward pass of the lookup
+operator: ![lookup table training](./lookup_table_training.png)
+## Distributed Storage Service
+The forward algorithm requires a distributed storage service for W.
+The backward algorithm prefers that the storage system can apply the
+optimization algorithm on W.  The following two sections describe two
+solutions -- the former doesn't require that the storage service can
+do optimization, the latter does.
+### Storage Service Doesn't Optimize
+In this design, we use highly-optimized distributed storage, e.g.,
+memcached, as the storage service, and we run the optimization
+algorithm on parameter servers of PaddlePaddle.  The following figure
+illustrates the training process.
+<!--
+Note: please update the following URL when update this digraph.
+<img src='https://g.gravizo.com/svg?
+digraph G {
+  rankdir="LR";
+  subgraph cluster1 {
+  P1 [label="pserver 1"];
+  P2 [label="pserver 2"];
+  T1 [label="trainer 1"];
+  T2 [label="trainer 2"];
+  T3 [label="trainer 3"];
+  }
+  KV [label="memcached"];
+  T1 -> P1;
+  T1 -> P2;
+  T2 -> P1;
+  T2 -> P2;
+  T3 -> P1;
+  T3 -> P2;
+  P1 -> KV [color=gray, weight=0.1];
+  KV -> P1 [color=gray, weight=0.1];
+  P2 -> KV [color=gray, weight=0.1];
+  KV -> P2 [color=gray, weight=0.1];
+  KV -> T1 [color=gray, weight=0.1];
+  KV -> T2 [color=gray, weight=0.1];
+  KV -> T3 [color=gray, weight=0.1];
+}
+)
+'/>
+-->
+<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>
+Each trainer runs the forward and backward passes using their local
+data:
+1. In the forward pass, when a trainer runs the forward algorithm of a
+   lookup operator, it retrieves W(x) from the storage service.
+1. The trainer computes W'(x) in the backward pass using W(x).
+During the global update process:
+1. Each trainer uploads its W'(x) to parameter servers.
+1. The parameter server runs the optimization algorithm, e.g., the
+   Adam optimization algorithm, which requires that
+   1. The parameter server retrieves W(x) from memcached, and
+   1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
+      W'(x))$ to memcached, where $f$ denotes the optimization
+      algorithm.
+### Storage Service Does Optimize
+This design is very similar to the above one, except that the
+optimization algorithm $f$ runs on the storage service.
+- Pro: parameter servers do not retrieve W(x) from the storage
+  service, thus saves half network communication.
+- Con: the storage service needs to be able to run the optimization
+  algorithm.
+## Conclusion
+Let us do the "storage service does not optimize" solution first, as a
+baseline at least, because it is easier to use a well-optimized
+distributed storage service like memcached.  We can do the "storage
+service does optimize" solution later or at the same time, which, if
+implemented carefully, should have better performance than the former.
--- a/doc/design/images/duplicate_op.graffle
+++ b/doc/design/images/duplicate_op.graffle
--- a/doc/design/images/duplicate_op.png
+++ b/doc/design/images/duplicate_op.png
--- a/doc/design/images/duplicate_op2.graffle
+++ b/doc/design/images/duplicate_op2.graffle
--- a/doc/design/images/duplicate_op2.png
+++ b/doc/design/images/duplicate_op2.png
--- a/doc/design/images/replica.png
+++ b/doc/design/images/replica.png
--- a/doc/design/images/two_phase_commit.png
+++ b/doc/design/images/two_phase_commit.png
--- a/doc/design/lookup_table.png
+++ b/doc/design/lookup_table.png
--- a/doc/design/lookup_table_training.png
+++ b/doc/design/lookup_table_training.png
--- a/doc/design/images/asgd.gif
+++ b/doc/design/images/asgd.gif
--- a/doc/design/images/theta_star.gif
+++ b/doc/design/images/theta_star.gif
--- a/doc/design/parameter_average.md
+++ b/doc/design/parameter_average.md
--- a/doc/design/build_system/README.md
+++ b/doc/design/build_system/README.md
--- a/doc/design/block.md
+++ b/doc/design/block.md
--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
+# C++ Data Feeding
+While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
+## Overview
+![](images/readers.png)
+## Reader
+In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data.
+### ReaderBase
+`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
+```cpp
+class ReaderBase {
+ public:
+  // Reads the next batch of data. (A 'batch' can be only one instance)
+  // If the next batch doesn't exist, it throws an exception
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Checks whether the next instance exists.
+  virtual bool HasNext() = 0;
+  // Reinitializes the reader and read the file from the beginning.
+  virtual void ReInit() = 0;
+  virtual ~ReaderBase();
+};
+```
+### FileReader
+`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format.
+```cpp
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& dims);
+  void ReadNext(std::vector<LoDTensor>* out) override;
+ protected:
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+ private:
+  std::vector<DDim> dims_;
+};
+```
+A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
+The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`.  
+### DecoratedReader
+A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling,  batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+```cpp
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+  void ReInit() override { reader_->ReInit(); }
+  bool HasNext() const override { return reader_->HasNext(); }
+ protected:
+  ReaderBase* reader_;
+};
+```
+Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated for multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops use readers without knowing their underlying type.
+### MultipleReader
+All `FileReader` binds with a single file and are single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
+So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yield by all prefetching readers and makes subsequent OPs or decorated readers be able to fetch data without concerning about multiple readers scheduling.
+![](images/multiple_reader.png)
+This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time when a prefetching file reader is free(complete reading from one file), it fetches a new file from the queue. Each prefetching file reader runs in a separated prefetch thread and dumps their outputs to the same channel.
+To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel. 
+### ReaderHolder
+Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+We would have to write:
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible.
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+## Related Operators
+To create and invoke readers, some new ops are introduced:
+### CreateReaderOp
+Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
+However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
+### OpenFilesOp
+The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
+To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
+### HasNextOp
+`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
+### ResetOp
+`ResetOp` is used to reset a reader via its `ReInit()` interface.
+### ReadOp
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
+## Program with Readers
+A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`.
+The ops of a `startup_program` with readers would be like this:
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+double_buffer_reader = create_double_buffer_op(batch_reader)
+... (other initializers)
+```
+The forwarding ops of the corresponding `main_program` would be like this:
+```
+while_op {
+    has_next = has_next_op(double_buffer_reader)
+    if_else_op(has_next) {
+        batch_data = read_op(double_buffer_reader)
+        ... (subsequent training ops)
+    } else {
+        reset_op(double_buffer_reader)
+    }
+}
+```
+Two important considerations for these programs are as follows:
+1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
--- a/doc/fluid/design/concepts/images/multiple_reader.png
+++ b/doc/fluid/design/concepts/images/multiple_reader.png
--- a/doc/fluid/design/concepts/images/readers.png
+++ b/doc/fluid/design/concepts/images/readers.png
--- a/paddle/fluid/framework/lod_tensor.md
+++ b/paddle/fluid/framework/lod_tensor.md
--- a/doc/design/program.md
+++ b/doc/design/program.md
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
--- a/paddle/fluid/framework/tensor.md
+++ b/paddle/fluid/framework/tensor.md
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
--- a/paddle/fluid/framework/variable.md
+++ b/paddle/fluid/framework/variable.md
--- a/doc/design/concurrent_programming.md
+++ b/doc/design/concurrent_programming.md
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
--- a/doc/design/parallel_do.md
+++ b/doc/design/parallel_do.md
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
--- a/doc/design/dist_refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
-# Design Doc: Distributed Training Architecture
+# Design Doc: Fluid Distributed Training Architecture
 ## Abstract
@@ -155,7 +155,7 @@ Cluster environment.
 <img src="src/remote_executor.png" width="500" align="center" />
 `RemoteExecutor.run` sends the `ProgramDesc` and
-[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
 to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
 to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.

--- a/doc/design/dist_refactor/multi_cpu.md
+++ b/doc/design/dist_refactor/multi_cpu.md
--- a/doc/design/dist_refactor/parameter_server.md
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -59,6 +59,17 @@ After converting:
     queue. It will block until the queue has the required number of
     tensors.
+### Sparse Update
+For embedding layers, the gradient may have many rows containing only 0 when training,
+if the gradient uses a dense tensor to do parameter optimization,
+it could spend unnecessary memory, slow down the calculations and waste
+the bandwidth while doing distributed training.
+In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
+non-zero gradient data. So when we do parameter optimization both locally and remotely,
+we only need to send those non-zero rows to the optimizer operators:
+<img src="src/sparse_update.png" width="700" />
 ### Benefits
@@ -91,6 +102,6 @@ After converting:
  `min_count` attribute), does our current design support it? (similar
  question for the *Add* OP)
+### References
-### References:
 [1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
--- a/doc/design/dist_refactor/src/compiler.graffle
+++ b/doc/design/dist_refactor/src/compiler.graffle
--- a/doc/design/dist_refactor/src/compiler.png
+++ b/doc/design/dist_refactor/src/compiler.png
--- a/doc/design/dist_refactor/src/dist-graph.graffle
+++ b/doc/design/dist_refactor/src/dist-graph.graffle
--- a/doc/design/dist_refactor/src/dist-graph.png
+++ b/doc/design/dist_refactor/src/dist-graph.png
--- a/doc/design/dist_refactor/src/distributed_architecture.graffle
+++ b/doc/design/dist_refactor/src/distributed_architecture.graffle
--- a/doc/design/dist_refactor/src/distributed_architecture.png
+++ b/doc/design/dist_refactor/src/distributed_architecture.png
--- a/doc/design/dist_refactor/src/local-graph.graffle
+++ b/doc/design/dist_refactor/src/local-graph.graffle
--- a/doc/design/dist_refactor/src/local-graph.png
+++ b/doc/design/dist_refactor/src/local-graph.png
--- a/doc/design/dist_refactor/src/local_architecture.graffle
+++ b/doc/design/dist_refactor/src/local_architecture.graffle
--- a/doc/design/dist_refactor/src/local_architecture.png
+++ b/doc/design/dist_refactor/src/local_architecture.png
--- a/doc/design/dist_refactor/src/multi-threads.graffle
+++ b/doc/design/dist_refactor/src/multi-threads.graffle
--- a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png
+++ b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png
--- a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png
+++ b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png
--- a/doc/design/dist_refactor/src/paddle-compile.graffle
+++ b/doc/design/dist_refactor/src/paddle-compile.graffle
--- a/doc/design/dist_refactor/src/paddle-compile.png
+++ b/doc/design/dist_refactor/src/paddle-compile.png
--- a/doc/design/dist_refactor/src/remote_executor.graffle
+++ b/doc/design/dist_refactor/src/remote_executor.graffle
--- a/doc/design/dist_refactor/src/remote_executor.png
+++ b/doc/design/dist_refactor/src/remote_executor.png
--- a/doc/fluid/design/dist_train/src/sparse_update.graffle
+++ b/doc/fluid/design/dist_train/src/sparse_update.graffle
--- a/doc/fluid/design/dist_train/src/sparse_update.png
+++ b/doc/fluid/design/dist_train/src/sparse_update.png
--- a/doc/design/ops/images/2_level_rnn.dot
+++ b/doc/design/ops/images/2_level_rnn.dot
--- a/doc/design/ops/images/2_level_rnn.png
+++ b/doc/design/ops/images/2_level_rnn.png
--- a/doc/design/ops/images/rnn.dot
+++ b/doc/design/ops/images/rnn.dot
--- a/doc/design/ops/images/rnn.jpg
+++ b/doc/design/ops/images/rnn.jpg
--- a/doc/design/ops/rnn.md
+++ b/doc/design/ops/rnn.md
--- a/doc/design/ops/images/rnn.png
+++ b/doc/design/ops/images/rnn.png
--- a/doc/design/ops/images/rnn_2level_data.dot
+++ b/doc/design/ops/images/rnn_2level_data.dot
--- a/doc/design/ops/images/rnn_2level_data.png
+++ b/doc/design/ops/images/rnn_2level_data.png
--- a/paddle/fluid/operators/op_documentation/rnn_design.md
+++ b/paddle/fluid/operators/op_documentation/rnn_design.md
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
--- a/doc/design/switch.md
+++ b/doc/design/switch.md
--- a/doc/design/multi_language_interface/00.why_plain_c.md
+++ b/doc/design/multi_language_interface/00.why_plain_c.md
--- a/doc/design/multi_language_interface/01.inference_implementation.md
+++ b/doc/design/multi_language_interface/01.inference_implementation.md
--- a/paddle/fluid/memory/README.md
+++ b/paddle/fluid/memory/README.md
--- a/doc/design/images/control_flow_graph.png
+++ b/doc/design/images/control_flow_graph.png
--- a/doc/design/images/dataflow_equations.png
+++ b/doc/design/images/dataflow_equations.png
--- a/doc/design/images/deep_learning.png
+++ b/doc/design/images/deep_learning.png
--- a/doc/design/memory_optimization.md
+++ b/doc/design/memory_optimization.md
--- a/doc/design/backward.md
+++ b/doc/design/backward.md
--- a/paddle/fluid/operators/op_documentation/batch_norm_op.md
+++ b/paddle/fluid/operators/op_documentation/batch_norm_op.md
--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
--- a/paddle/fluid/operators/images/batch_norm_fork.dot
+++ b/paddle/fluid/operators/images/batch_norm_fork.dot
--- a/paddle/fluid/operators/images/batch_norm_fork.png
+++ b/paddle/fluid/operators/images/batch_norm_fork.png
--- a/paddle/fluid/operators/images/batch_norm_op_kernel.png
+++ b/paddle/fluid/operators/images/batch_norm_op_kernel.png
--- a/doc/design/images/feed_forward.png
+++ b/doc/design/images/feed_forward.png
--- a/doc/design/images/feed_forward_regularized.png
+++ b/doc/design/images/feed_forward_regularized.png
--- a/doc/design/images/l1_regularization.png
+++ b/doc/design/images/l1_regularization.png
--- a/doc/design/images/l2_regularization.png
+++ b/doc/design/images/l2_regularization.png
--- a/doc/design/images/loss_equation.png
+++ b/doc/design/images/loss_equation.png
--- a/doc/design/infer_var_type.md
+++ b/doc/design/infer_var_type.md
--- a/paddle/fluid/operators/op_documentation/net_op_design.md
+++ b/paddle/fluid/operators/op_documentation/net_op_design.md
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
--- a/doc/design/prune.md
+++ b/doc/design/prune.md
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
--- a/doc/design/regularization.md
+++ b/doc/design/regularization.md
--- a/doc/design/selected_rows.md
+++ b/doc/design/selected_rows.md
--- a/doc/design/api.md
+++ b/doc/design/api.md
--- a/doc/design/fluid-compiler.graffle
+++ b/doc/design/fluid-compiler.graffle
--- a/doc/design/fluid-compiler.png
+++ b/doc/design/fluid-compiler.png
--- a/doc/design/fluid.md
+++ b/doc/design/fluid.md
--- a/doc/design/fluid_compiler.md
+++ b/doc/design/fluid_compiler.md
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
--- a/doc/design/kernel_hint_design.md
+++ b/doc/design/kernel_hint_design.md
--- a/doc/design/kernel_selection.md
+++ b/doc/design/kernel_selection.md
--- a/doc/design/operator_kernel_type.md
+++ b/doc/design/operator_kernel_type.md
--- a/doc/design/speech/deep_speech_2.md
+++ b/doc/design/speech/deep_speech_2.md
--- a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/design/speech/image/beam_search.png
+++ b/doc/design/speech/image/beam_search.png
--- a/doc/design/speech/image/ds2_network.png
+++ b/doc/design/speech/image/ds2_network.png
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
--- a/doc/design/dcgan.png
+++ b/doc/design/dcgan.png
--- a/doc/design/gan_api.md
+++ b/doc/design/gan_api.md
--- a/doc/design/graph.md
+++ b/doc/design/graph.md
--- a/doc/design/graph_survey.md
+++ b/doc/design/graph_survey.md
--- a/doc/design/images/graph_construction_example.bash
+++ b/doc/design/images/graph_construction_example.bash
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
--- a/doc/design/images/graph_construction_example_all.png
+++ b/doc/design/images/graph_construction_example_all.png
--- a/doc/design/images/graph_construction_example_forward_backward.png
+++ b/doc/design/images/graph_construction_example_forward_backward.png
--- a/doc/design/images/graph_construction_example_forward_only.png
+++ b/doc/design/images/graph_construction_example_forward_only.png
--- a/doc/design/parameters_in_cpp.md
+++ b/doc/design/parameters_in_cpp.md
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
--- a/doc/design/test.dot
+++ b/doc/design/test.dot
--- a/doc/design/test.dot.png
+++ b/doc/design/test.dot.png
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
--- a/doc/design/ci_build_whl.png
+++ b/doc/design/ci_build_whl.png
--- a/paddle/fluid/operators/op_documentation/name_convention.md
+++ b/paddle/fluid/operators/op_documentation/name_convention.md
--- a/paddle/fluid/operators/op_documentation/op_markdown_format.md
+++ b/paddle/fluid/operators/op_documentation/op_markdown_format.md
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
--- a/doc/fluid/dev/src/fc.py
+++ b/doc/fluid/dev/src/fc.py
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
--- a/doc/design/model_format.md
+++ b/doc/design/model_format.md
--- a/doc/v2/howto/optimization/pprof_1.png
+++ b/doc/v2/howto/optimization/pprof_1.png
--- a/doc/v2/howto/optimization/pprof_2.png
+++ b/doc/v2/howto/optimization/pprof_2.png
--- a/doc/fluid/howto/optimization/timeline.jpeg
+++ b/doc/fluid/howto/optimization/timeline.jpeg
--- a/doc/fluid/howto/optimization/timeline.md
+++ b/doc/fluid/howto/optimization/timeline.md
--- a/doc/fluid/howto/optimization/tracing.jpeg
+++ b/doc/fluid/howto/optimization/tracing.jpeg
--- a/doc/design/error_clip.md
+++ b/doc/design/error_clip.md
--- a/doc/design/images/profiler.png
+++ b/doc/design/images/profiler.png
--- a/doc/design/profiler.md
+++ b/doc/design/profiler.md
--- a/doc/design/images/multigpu_allreduce.graffle
+++ b/doc/design/images/multigpu_allreduce.graffle
--- a/doc/design/images/multigpu_allreduce.png
+++ b/doc/design/images/multigpu_allreduce.png
--- a/doc/design/images/multigpu_before_convert.graffle
+++ b/doc/design/images/multigpu_before_convert.graffle
--- a/doc/design/images/multigpu_before_convert.png
+++ b/doc/design/images/multigpu_before_convert.png
--- a/doc/design/mkl/mkldnn_fluid.md
+++ b/doc/design/mkl/mkldnn_fluid.md
--- a/doc/design/paddle_nccl.md
+++ b/doc/design/paddle_nccl.md
--- a/doc/fluid/read_source.md
+++ b/doc/fluid/read_source.md
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
--- a/doc/design/cluster_train/README.md
+++ b/doc/design/cluster_train/README.md
--- a/doc/design/cluster_train/checkpointing.md
+++ b/doc/design/cluster_train/checkpointing.md
--- a/doc/design/cluster_train/data_dispatch.md
+++ b/doc/design/cluster_train/data_dispatch.md
--- a/doc/design/cluster_train/large_model_dist_train.md
+++ b/doc/design/cluster_train/large_model_dist_train.md
--- a/doc/design/cluster_train/master_server.md
+++ b/doc/design/cluster_train/master_server.md
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
--- a/doc/design/cluster_train/remote_parameter_updater.md
+++ b/doc/design/cluster_train/remote_parameter_updater.md
--- a/doc/design/cluster_train/save_model.md
+++ b/doc/design/cluster_train/save_model.md
--- a/doc/design/cluster_train/src/checkpointing.png
+++ b/doc/design/cluster_train/src/checkpointing.png
--- a/doc/design/cluster_train/src/data_dispatch.png
+++ b/doc/design/cluster_train/src/data_dispatch.png
--- a/doc/design/cluster_train/src/dataset.graffle
+++ b/doc/design/cluster_train/src/dataset.graffle
--- a/doc/design/cluster_train/src/dataset.png
+++ b/doc/design/cluster_train/src/dataset.png
--- a/doc/design/cluster_train/src/file_storage.graffle
+++ b/doc/design/cluster_train/src/file_storage.graffle
--- a/doc/design/cluster_train/src/file_storage.png
+++ b/doc/design/cluster_train/src/file_storage.png
--- a/doc/design/cluster_train/src/init_lock.graffle
+++ b/doc/design/cluster_train/src/init_lock.graffle
--- a/doc/design/cluster_train/src/init_lock.png
+++ b/doc/design/cluster_train/src/init_lock.png
--- a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png
+++ b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png
--- a/doc/design/cluster_train/src/paddle-etcd.graffle
+++ b/doc/design/cluster_train/src/paddle-etcd.graffle
--- a/doc/design/cluster_train/src/paddle-etcd.png
+++ b/doc/design/cluster_train/src/paddle-etcd.png
--- a/doc/design/cluster_train/src/paddle-model-sharding.graffle
+++ b/doc/design/cluster_train/src/paddle-model-sharding.graffle
--- a/doc/design/cluster_train/src/paddle-model-sharding.png
+++ b/doc/design/cluster_train/src/paddle-model-sharding.png
--- a/doc/design/cluster_train/src/paddle-ps-0.png
+++ b/doc/design/cluster_train/src/paddle-ps-0.png
--- a/doc/design/cluster_train/src/paddle-ps-1.png
+++ b/doc/design/cluster_train/src/paddle-ps-1.png
--- a/doc/design/cluster_train/src/paddle-ps.graffle
+++ b/doc/design/cluster_train/src/paddle-ps.graffle
--- a/doc/design/cluster_train/src/paddle-task-queues.graffle
+++ b/doc/design/cluster_train/src/paddle-task-queues.graffle
--- a/doc/design/cluster_train/src/paddle-task-queues.png
+++ b/doc/design/cluster_train/src/paddle-task-queues.png
--- a/doc/design/cluster_train/src/paddle-task-states.graffle
+++ b/doc/design/cluster_train/src/paddle-task-states.graffle
--- a/doc/design/cluster_train/src/paddle-task-states.png
+++ b/doc/design/cluster_train/src/paddle-task-states.png
--- a/doc/design/cluster_train/src/pserver_init.graffle
+++ b/doc/design/cluster_train/src/pserver_init.graffle
--- a/doc/design/cluster_train/src/pserver_init.png
+++ b/doc/design/cluster_train/src/pserver_init.png
--- a/doc/design/cluster_train/src/submit-job.graffle
+++ b/doc/design/cluster_train/src/submit-job.graffle
--- a/doc/design/cluster_train/src/submit-job.png
+++ b/doc/design/cluster_train/src/submit-job.png
--- a/doc/design/cluster_train/src/trainer.graffle
+++ b/doc/design/cluster_train/src/trainer.graffle
--- a/doc/design/cluster_train/src/trainer.png
+++ b/doc/design/cluster_train/src/trainer.png
--- a/doc/design/cluster_train/submit-job.md
+++ b/doc/design/cluster_train/submit-job.md
--- a/doc/design/mkl/image/engine.png
+++ b/doc/design/mkl/image/engine.png
--- a/doc/design/mkl/image/gradients.png
+++ b/doc/design/mkl/image/gradients.png
--- a/doc/design/mkl/image/layers.png
+++ b/doc/design/mkl/image/layers.png
--- a/doc/design/mkl/image/matrix.png
+++ b/doc/design/mkl/image/matrix.png
--- a/doc/design/mkl/image/overview.png
+++ b/doc/design/mkl/image/overview.png
--- a/doc/design/mkl/mkl_packed.md
+++ b/doc/design/mkl/mkl_packed.md
--- a/doc/design/mkl/mkldnn.md
+++ b/doc/design/mkl/mkldnn.md
--- a/doc/v2/dev/index_cn.rst
+++ b/doc/v2/dev/index_cn.rst
--- a/doc/v2/dev/new_layer_cn.rst
+++ b/doc/v2/dev/new_layer_cn.rst
--- a/doc/v2/dev/new_layer_en.rst
+++ b/doc/v2/dev/new_layer_en.rst
--- a/doc/v2/dev/FullyConnected.jpg
+++ b/doc/v2/dev/FullyConnected.jpg
--- a/doc/v2/dev/src/doc_en.png
+++ b/doc/v2/dev/src/doc_en.png
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
--- a/doc/v2/getstarted/index_cn.rst
+++ b/doc/v2/getstarted/index_cn.rst
--- a/doc/v2/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
--- a/doc/v2/howto/cluster/cmd_argument_cn.md
+++ b/doc/v2/howto/cluster/cmd_argument_cn.md
--- a/doc/v2/howto/cluster/cmd_argument_en.md
+++ b/doc/v2/howto/cluster/cmd_argument_en.md
--- a/doc/v2/howto/cluster/index_en.rst
+++ b/doc/v2/howto/cluster/index_en.rst
--- a/doc/v2/howto/index_cn.rst
+++ b/doc/v2/howto/index_cn.rst
--- a/doc/v2/howto/rnn/hierarchical_layer_cn.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_cn.rst
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
--- a/doc/v2/howto/rnn/index_cn.rst
+++ b/doc/v2/howto/rnn/index_cn.rst
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
--- a/paddle/fluid/operators/detection_output_op.cu.cc
+++ b/paddle/fluid/operators/detection_output_op.cu.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
--- a/paddle/fluid/operators/detail/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.h
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/proto_encoder_helper.h
+++ b/paddle/fluid/operators/detail/proto_encoder_helper.h
--- a/paddle/fluid/operators/detail/safe_ref.h
+++ b/paddle/fluid/operators/detail/safe_ref.h
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
--- a/paddle/fluid/operators/detail/test_serde.cc
+++ b/paddle/fluid/operators/detail/test_serde.cc
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
--- a/paddle/fluid/operators/detection_output_op.cc
+++ b/paddle/fluid/operators/detection_output_op.cc
--- a/paddle/fluid/operators/detection_output_op.h
+++ b/paddle/fluid/operators/detection_output_op.h
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/detection_util.h
+++ b/paddle/fluid/operators/math/detection_util.h
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/nccl_test.cu
+++ b/paddle/fluid/platform/nccl_test.cu
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
--- a/paddle/fluid/pybind/recordio.h
+++ b/paddle/fluid/pybind/recordio.h
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/fluid/recordio/CMakeLists.txt
+++ b/paddle/fluid/recordio/CMakeLists.txt
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
--- a/paddle/fluid/recordio/header.h
+++ b/paddle/fluid/recordio/header.h
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
--- a/paddle/fluid/recordio/writer.cc
+++ b/paddle/fluid/recordio/writer.cc
--- a/paddle/fluid/recordio/writer.h
+++ b/paddle/fluid/recordio/writer.h
--- a/paddle/fluid/recordio/writer_scanner_test.cc
+++ b/paddle/fluid/recordio/writer_scanner_test.cc
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
--- a/python/paddle/fluid/debuger.py
+++ b/python/paddle/fluid/debuger.py
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
--- a/python/paddle/fluid/tests/test_concurrency.py
+++ b/python/paddle/fluid/tests/test_concurrency.py
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_detection_output_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_output_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ b/tools/manylinux1/build_scripts/install_nccl2.sh
--- a/tools/timeline.py
+++ b/tools/timeline.py