diff --git a/.travis.yml b/.travis.yml
index 8c772030925dcad3909f142b08e4d8057a3f89b7..a406841f6abf01f15826f34fe4c63b4c24486ccd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,7 +31,7 @@ script:
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
- if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
+ if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
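A quick way to sanity-check the corrected branch pattern is to run it against a few names. A minimal Python sketch, rewriting the POSIX class `[[:digit:]]` as `\d` and grouping the alternation as in the fixed line above (the branch names are made up for illustration):

```python
import re

# Grouped alternation: both "vX.Y[.Z]" and "release/X.Y[.Z]" deploy docs.
BRANCH_RE = re.compile(r"^(v|release/)\d+\.\d+(\.\d+)?(-\S*)?$")

for branch in ["develop", "v0.14.0", "release/0.14.0", "v0.14.0-rc1", "feature/foo"]:
    deploy = branch == "develop" or bool(BRANCH_RE.match(branch))
    print(branch, "->", "deploy docs" if deploy else "skip")
```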
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 997672169fbb4d24028a4529b1a97880b7480503..23bb27e77b9eab0c322a71a8ff570d12d1050377 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
+option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
diff --git a/Dockerfile b/Dockerfile
index fc5069a6c080ed23317695e6822c4c46b5b5c7f9..48c750358cfcb227667c429f19befcaa2f51ebbd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,7 +23,7 @@ ENV HOME /root
COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
- apt-get install -y --allow-downgrades \
+ apt-get install -y --allow-downgrades patchelf \
git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
diff --git a/README.md b/README.md
index 63abca069a6629ac59739224ded9cd9f06207d0a..eb99ed21d02650ef16cc7da91836909c02895be9 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
+### Latest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
## Features
- **Flexibility**
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index 99c9d79b068f5886012fd702d84d0666b9d197b5..a79f25ccc6ace1594f3f331633130eaace5e175b 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -125,6 +125,10 @@ def parse_args():
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
- help='If set, uses inference transpiler to optimize the program.')
+ help='If set, use inference transpiler to optimize the program.')
+ parser.add_argument(
+ '--no_random',
+ action='store_true',
+ help='If set, keep the random seed and do not shuffle the data.')
args = parser.parse_args()
return args
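Both new options are plain boolean switches: `action='store_true'` makes the attribute default to `False` and flip to `True` when the flag is passed. A self-contained sketch of just these two flags:

```python
import argparse

parser = argparse.ArgumentParser(description='Sketch of the two new benchmark flags.')
parser.add_argument(
    '--use_inference_transpiler',
    action='store_true',
    help='If set, use inference transpiler to optimize the program.')
parser.add_argument(
    '--no_random',
    action='store_true',
    help='If set, keep the random seed and do not shuffle the data.')

args = parser.parse_args(['--no_random'])
print(args.use_inference_transpiler, args.no_random)  # False True
```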
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
old mode 100755
new mode 100644
index dcd4d9ea95d816029317a29055b5ca8273ac9f43..94ea7bd6aca7c9595037a2dacc5e36d4c77827e7
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -132,10 +132,6 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(startup_prog)
# Use inference_transpiler to speedup
- if args.use_inference_transpiler:
- t = fluid.InferenceTranspiler()
- t.transpile(infer_prog, place)
-
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
@@ -186,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
# evaluation
if not args.no_test and batch_acc and not args.use_reader_op:
+ if args.use_inference_transpiler:
+ t = fluid.InferenceTranspiler()
+ t.transpile(infer_prog, place)
+
pass_test_acc = test(exe, infer_prog, test_reader, feeder,
batch_acc)
print(", Test Accuracy: %f" % pass_test_acc)
@@ -316,6 +316,8 @@ def main():
args = parse_args()
print_arguments(args)
print_paddle_envs()
+ if args.no_random:
+ fluid.default_startup_program().random_seed = 1
# the unique trainer id, starting from 0, needed by trainer
# only
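Pinning the startup program's seed makes parameter initialization deterministic across runs; data shuffling is disabled separately in the reader (see `models/resnet.py` below). A minimal sketch of the seed pinning, assuming a working `paddle.fluid` install:

```python
import paddle.fluid as fluid

no_random = True  # stands in for args.no_random
if no_random:
    # Every initializer in the startup program now draws from a fixed
    # seed, so two runs start from identical weights.
    fluid.default_startup_program().random_seed = 1
print(fluid.default_startup_program().random_seed)  # 1
```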
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index 9ed1093c54a501cc93dbbf9c3651fe70914ce26b..d44a9c07d31cfae9d54ad5949b85c77e60eae258 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -197,12 +197,12 @@ def get_model(args):
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
batched_train_reader = paddle.batch(
- paddle.reader.shuffle(
+ train_reader if args.no_random else paddle.reader.shuffle(
train_reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
batched_test_reader = paddle.batch(
- train_reader, batch_size=args.batch_size, drop_last=True)
+ test_reader, batch_size=args.batch_size, drop_last=True)
return avg_cost, inference_program, optimizer, batched_train_reader,\
batched_test_reader, batch_acc
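The conditional above either wraps the reader in `paddle.reader.shuffle` or passes it through untouched. To show the buffer-then-shuffle semantics without requiring Paddle, here is a pure-Python stand-in (`shuffle_reader` is a hypothetical helper, not a Paddle API):

```python
import random

def shuffle_reader(reader, buf_size):
    # Stand-in for paddle.reader.shuffle: fill a buffer, shuffle it, yield.
    def shuffled():
        buf = []
        for sample in reader():
            buf.append(sample)
            if len(buf) >= buf_size:
                random.shuffle(buf)
                for s in buf:
                    yield s
                buf = []
        random.shuffle(buf)
        for s in buf:
            yield s
    return shuffled

train_reader = lambda: iter(range(10))
no_random = False
source = train_reader if no_random else shuffle_reader(train_reader, buf_size=5)
print(list(source()))
```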
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index e3b9d94215a858c5c9a34e1b7e97540f1876801d..6ed51c648478efb9784d0c43b169c285e740e0f3 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -83,18 +83,20 @@ else()
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()
-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
+ find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER REFERENCE)
- set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
- set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
- add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
- message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+ if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+ set(CBLAS_FOUND ON)
+ set(CBLAS_PROVIDER REFERENCE)
+ set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+ set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+ add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+ message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+ endif()
endif()
if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index d205e3958234cabfbfeba8c3d725fe618ce48ace..fb3d8ef8d53436f387acc3069a0eb887e6f07c59 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -7,7 +7,17 @@ set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
-set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+set(ANAKIN_COMPILE_EXTRA_FLAGS
+ -Wno-error=unused-variable -Wno-unused-variable
+ -Wno-error=format-extra-args -Wno-format-extra-args
+ -Wno-error=comment -Wno-comment
+ -Wno-error=format -Wno-format
+ -Wno-error=switch -Wno-switch
+ -Wno-error=return-type -Wno-return-type
+ -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
+ -Wno-sign-compare
+ -Wno-reorder
+ -Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index fd7fc16bff5651f022b484623243048fbd225b5a..eafb11b6f21e226fc68556a78d675dea94080140 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -257,8 +257,8 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
- add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+ target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -324,8 +324,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
- add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+ target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
diff --git a/cmake/version.cmake b/cmake/version.cmake
index cde650128a068faf32f4abfff5cdfdeb656d8577..79b8e8ac496250d85427b77fbd6a9924a962a15b 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -1,16 +1,21 @@
# Get the latest git tag.
set(PADDLE_VERSION $ENV{PADDLE_VERSION})
set(tmp_version "HEAD")
+set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
while ("${PADDLE_VERSION}" STREQUAL "")
execute_process(
- COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+ COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check the tag is a correct version
- if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+ if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+ # if no tag was found, set PADDLE_VERSION to latest
+ set(PADDLE_VERSION "latest")
+ elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
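With `--always`, `git describe` falls back to a bare commit hash when no tag is reachable, which is what the new first branch catches. A rough Python model of the decision order, under the caveat that CMake's `MATCHES` is unanchored while this sketch anchors the commit check:

```python
import re

TAG_VERSION_REGEX = r"[0-9]+\.[0-9]+\.[0-9]+(\.(a|b|rc)\.[0-9]+)?"
COMMIT_VERSION_REGEX = r"[0-9a-f]{5,}"  # five chained [0-9a-f]+ mean "5+ hex chars"

def resolve_version(described):
    """described: output of `git describe --tags --abbrev=0 --always <ref>`."""
    if re.fullmatch(COMMIT_VERSION_REGEX, described):
        return "latest"               # no tag found, only a commit hash
    if re.match("v" + TAG_VERSION_REGEX, described):
        return described.lstrip("v")  # strip the "v" prefix like the CMake does
    return None                       # caller walks back one tag and retries

print(resolve_version("v0.14.0"))  # 0.14.0
print(resolve_version("8c7720"))   # latest
```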
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
index 943d39331d26c05764c90cb24f6774997c976bfe..d2ac04f1449c32cb414cea1b76d7469bbe9ccb85 100644
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
@@ -14,6 +14,15 @@ DistributeTranspiler
:members:
:noindex:
+.. _api_fluid_transpiler_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+ :members:
+ :noindex:
+
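Usage of the newly documented class mirrors the `fluid_benchmark.py` change above: clone the main program for inference, then transpile it in place before evaluation. A minimal sketch, assuming the main program already defines a network:

```python
import paddle.fluid as fluid

place = fluid.CPUPlace()
infer_prog = fluid.default_main_program().clone(for_test=True)

t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)  # optimizes the program for inference in place
```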
.. _api_fluid_transpiler_memory_optimize:
memory_optimize
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1
--- /dev/null
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that enables training with `ParallelExecutor`
+using [NCCL2](https://developer.nvidia.com/nccl) as its collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi-GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only difference from the single-node
+version is that we need to broadcast the NCCL unique ID to all the
+nodes and initialize communicators using that ID, so that the NCCL2
+ranks know about each other.
+
+To achieve this feature, we introduce a new operator: the `gen_nccl_id` op,
+so we are ***not*** bound to running NCCL2 with MPI; it can run on
+whatever platform you like.
+
+It has two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both modes, this op saves the NCCL ID into the current scope as a
+persistable variable. We can then insert this op at the end of the
+fluid "startup program", so that all workers get the same ID to
+initialize their NCCL communicator objects.
+
+
+
+The above figure shows the general process of NCCL2 distributed
+training. Each trainer has as many communicators as GPUs, but the
+ranks must match the global rank numbers: here we have 8 GPUs in
+total, so `nranks==8`; the ranks are 0 ~ 3 on trainer 0 and 4 ~ 7 on
+trainer 1.
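A minimal sketch of the trainer-side Python, assuming two nodes with four GPUs each (`nranks==8` as in the figure), a loss variable named `loss`, and a startup program that ends with a `gen_nccl_id` op:

```python
import paddle.fluid as fluid

num_trainers = 2  # total nodes
trainer_id = 0    # 0 on the first node, 1 on the second

exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(fluid.default_startup_program())  # runs gen_nccl_id on each worker

# Each local GPU becomes one NCCL rank; global ranks are derived from
# trainer_id and the number of local devices.
pe = fluid.ParallelExecutor(
    use_cuda=True,
    loss_name="loss",
    num_trainers=num_trainers,
    trainer_id=trainer_id)
```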
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index 988729138926f035750b59eb245dde82502a3ad2..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,6 +1,6 @@
# Design Doc: Distributed Lookup Table Operator
-A lookup table operator in PaddlePaddle where the table could be out
+A distributed lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer.
## Background
@@ -24,14 +24,14 @@ memory, so we'd need a distributed storage service, which supports the
lookup of rows.
The following figure illustrates the multiplication of x with two
-non-zero elements, or say, two symbols, and a lookup table W:
+non-zero elements, or say two symbols, and a lookup table W:

### The Backward Algorithm
The backward algorithm computes W'(x) using W(x). W'(x) has the same
-scale of size as W(x) and is much smaller than W.
+size as W(x) and is much smaller than W.
To optimize W given W', we can do simple SGD update:
@@ -44,85 +44,46 @@ $$W = f(W, W')$$
The following figure illustrates the backward pass of the lookup
operator: 
-## Distributed Storage Service
-
-The forward algorithm requires a distributed storage service for W.
-The backward algorithm prefers that the storage system can apply the
-optimization algorithm on W. The following two sections describe two
-solutions -- the former doesn't require that the storage service can
-do optimization, the latter does.
-
-### Storage Service Doesn't Optimize
-
-In this design, we use highly-optimized distributed storage, e.g.,
-memcached, as the storage service, and we run the optimization
-algorithm on parameter servers of PaddlePaddle. The following figure
-illustrates the training process.
-
-
-
-
-
-Each trainer runs the forward and backward passes using their local
-data:
-
-1. In the forward pass, when a trainer runs the forward algorithm of a
- lookup operator, it retrieves W(x) from the storage service.
-1. The trainer computes W'(x) in the backward pass using W(x).
-
-During the global update process:
-
-1. Each trainer uploads its W'(x) to parameter servers.
-1. The parameter server runs the optimization algorithm, e.g., the
- Adam optimization algorithm, which requires that
- 1. The parameter server retrieves W(x) from memcached, and
- 1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
- W'(x))$ to memcached, where $f$ denotes the optimization
- algorithm.
-
-### Storage Service Does Optimize
-
-This design is very similar to the above one, except that the
-optimization algorithm $f$ runs on the storage service.
-
-- Pro: parameter servers do not retrieve W(x) from the storage
- service, thus saves half network communication.
-- Con: the storage service needs to be able to run the optimization
- algorithm.
-
-## Conclusion
-
-Let us do the "storage service does not optimize" solution first, as a
-baseline at least, because it is easier to use a well-optimized
-distributed storage service like memcached. We can do the "storage
-service does optimize" solution later or at the same time, which, if
-implemented carefully, should have better performance than the former.
+## Distributed Lookup Table
+### Problem 1: The lookup table may be very large.
+
+ In conditions like search engines and recommendation systems, the number of feature Ids may be very large, say 100,000,000,000; then for a lookup table whose value is a float vector of size 8, the total size of the table is:
+
+ ```
+ 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
+ ```
+
+### Solution: Distributed storage
+
+1. Paddle uses [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table. The lookup table parameter is split across multiple machines according to the hash of the feature Id, and the input data is also split and sent to the corresponding machines to prefetch the parameters.
+
+1. For common parameters, the trainer fetches the whole parameter for training, but for a big lookup table the trainer cannot store the whole parameter. Because the input data features are very sparse, each step only needs a few parameters for training, so we use `prefetch_op` to prefetch only the parameters needed by the trainer.
+
+### Problem 2: The Ids in the lookup table are not known before training.
+
+ The feature Ids are calculated by a hash function. Because the feature data source is so large, we cannot get all the Ids before training, so we cannot initialize the table before training.
+
+### Solution: Id auto growth
+
+At the beginning of training, Paddle only allocates the memory for the lookup table on the parameter server side; the Ids and their values are not initialized. During training, when a parameter server receives an Id, it returns the existing parameter if the Id is already in the lookup table; if the Id does not exist, Paddle adds it to the lookup table and initializes its value.
+
+### Problem 3: Parameter load and save
+
+For common parameters, Paddle uses the trainer to save and load them. But for the distributed lookup table, the trainer cannot do this because of its large size.
+
+### Solution: Parameter server side save and load
+
+Paddle supports parameter-server-side save and load for the distributed lookup table. Each parameter server machine saves and loads only its part of the whole table.
+
+## Architecture
+The whole architecture of the distributed lookup table is shown below:
+
+### Training steps:
+1. Read a batch of data; the data consists of feature Ids.
+1. The input Ids are split by `split_ids_op` with the same hash function as the lookup table (see the sketch after this list).
+1. The `prefetch_op` uses the split results to prefetch parameters back from the lookup table.
+1. Run forward-backward to get the gradients of the lookup table.
+1. `split_ids_op` splits the gradients, which are then sent to the parameter servers with `send_op`.
+1. The parameter servers update the table with the received gradients.
+
+
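The hash split in step 2 can be pictured with a toy example. A pure-Python sketch assuming a simple `id % num_pservers` hash; the actual hash used by `split_ids_op` is an implementation detail:

```python
num_pservers = 3
ids = [101, 7, 2048, 15, 9, 300]  # one batch of feature Ids

# Route each Id to the pserver that owns its shard of the table.
buckets = [[] for _ in range(num_pservers)]
for feature_id in ids:
    buckets[feature_id % num_pservers].append(feature_id)

for server, part in enumerate(buckets):
    print("pserver", server, "prefetches", part)
```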
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ
diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md
index 0c4b5bc24c854b7062d509249bea9c50d42bd5f1..edb0245ea083e791b7f32ac57a330698299fceda 100644
--- a/doc/v2/design/cluster_train/large_model_dist_train.md
+++ b/doc/v2/design/cluster_train/large_model_dist_train.md
@@ -52,7 +52,7 @@ In `trainer_internal.cpp:L93 trainOneBatch`:
When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
-In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
```c++
if (fullSize) {
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
index bd5bcf6f67168c21cebb046a629b948d1661e75c..4876de0045979be20fa45bdc84d2594516f71c03 100644
--- a/doc/v2/design/mkl/mkldnn.md
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -18,20 +18,20 @@ Figure 1. PaddlePaddle on IA
具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
## Contents
-
-- [Overview](#overview)
-- [Actions](#actions)
- - [CMake](#cmake)
- - [Matrix](#matrix)
- - [Layers](#layers)
- - [Activations](#activations)
- - [Parameters](#parameters)
- - [Gradients](#gradients)
- - [Unit Tests](#unit-tests)
- - [Python API](#python-api)
- - [Benchmarking](#benchmarking)
- - [Others](#others)
-- [Design Concerns](#design-concerns)
+
+- [Overview](#overview)
+- [Actions](#actions)
+ - [CMake](#cmake)
+ - [Matrix](#matrix)
+ - [Layers](#layers)
+ - [Activations](#activations)
+ - [Parameters](#parameters)
+ - [Gradients](#gradients)
+ - [Unit Tests](#unit-tests)
+ - [Python API](#python-api)
+ - [Benchmarking](#benchmarking)
+ - [Others](#others)
+- [Design Concerns](#design-concerns)
## Overview
@@ -218,20 +218,20 @@ if use_mkldnn
我们总结出一些特别需要注意的点:
1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,
-我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。
-2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。
+我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。
+2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。
3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。
-包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。
4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存,
同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。
在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
## References
1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
-主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
+主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
-但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。
+但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。
4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。
-所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。
+所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。
diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst
index 6a848a020df343c14601b9c3fcb5fb6fcde7f880..ad723738801908a5f48343574c204bdbfc97ee08 100644
--- a/doc/v2/dev/new_layer_en.rst
+++ b/doc/v2/dev/new_layer_en.rst
@@ -339,7 +339,7 @@ If you are creating a new file for the test, such as :code:`paddle/legacy/gserve
Implement Python Wrapper
========================
-Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
+Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
- Use :code:`@config_layer('fc')` at the decorator for all the Python wrapper class. :code:`fc` is the identifier of the layer.
- Implements :code:`__init__` constructor function.
diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md
index e223fd33a8420abcdfdad53d1cfc5ed160a1b37e..2c87e9afc6911526cd51d6c691f262960accc9e8 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
@@ -18,7 +18,7 @@
cpu_avx_openblas |
-暂无 |
+paddle.tgz |
cpu_noavx_openblas |
@@ -35,7 +35,12 @@
cuda8.0_cudnn7_avx_mkl |
paddle.tgz |
-
+
+
+cuda9.0_cudnn7_avx_mkl |
+paddle.tgz |
+
+
### 从源码编译
diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md
index 6212a3081116d988630706e83d2349dd200b73ab..3fa8a18a9fbea21b494c416e6b938990fbb68337 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_en.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@@ -17,7 +17,7 @@
cpu_avx_openblas |
-- |
+paddle.tgz |
cpu_noavx_openblas |
@@ -34,7 +34,12 @@
cuda8.0_cudnn7_avx_mkl |
paddle.tgz |
-
+
+
+cuda9.0_cudnn7_avx_mkl |
+paddle.tgz |
+
+
### From source
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
index 3acdbae28e9b35f8a9104a89c9a5799f8c892334..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d 100644
--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -28,9 +28,9 @@
### 准备预测模型
-准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。
+准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。
-调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
+调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
下面,我们将训练结束后存储下来的模型转换成预测模型。
@@ -48,7 +48,7 @@
dump_v2_config(predict, "trainer_config.bin", True)
```
- 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。
+ 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。
使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。
@@ -68,7 +68,7 @@
merge_v2_model(net, param_file, output_file)
```
- 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
+ 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
#### 注意事项
1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。
@@ -77,10 +77,10 @@
### 编写预测代码
-预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
+预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
#### step 1. 初始化PaddlePaddle运行环境
-第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。
+第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。
#### step2. 加载模型
@@ -88,8 +88,8 @@
概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式:
-1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型;
-1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
+1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型;
+1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。
- 注意事项
@@ -117,7 +117,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输
#### step 4. 前向计算
-完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
+完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
#### step 5. 清理
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index efa59fc4a5cf21e885435f564d2a19f892cb534b..6653244507742b33d9524a7a0e4a5b2b575d358a 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,24 +1,24 @@
if(NOT WITH_FLUID_ONLY)
add_subdirectory(legacy/cuda)
add_subdirectory(legacy/function)
- add_subdirectory(utils)
+ add_subdirectory(legacy/utils)
add_subdirectory(legacy/math)
add_subdirectory(legacy/gserver)
add_subdirectory(legacy/parameter)
if(MOBILE_INFERENCE)
- add_subdirectory(capi)
+ add_subdirectory(legacy/capi)
else()
add_subdirectory(legacy/pserver)
- add_subdirectory(trainer)
+ add_subdirectory(legacy/trainer)
add_subdirectory(scripts)
if(WITH_C_API)
- add_subdirectory(capi)
+ add_subdirectory(legacy/capi)
endif()
if(WITH_SWIG_PY)
- add_subdirectory(api)
+ add_subdirectory(legacy/api)
endif()
endif()
endif()
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index a8bbb4eb8081420ae0bbaf761bd27303c0d043cb..c30eff5010748685838feb984c9c817ffcf14c11 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -46,9 +46,14 @@ cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+# Here the shared library must not depend on other fluid libraries; otherwise a double free will occur.
cc_library(paddle_inference_api_shared SHARED
- SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
- DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+ SRCS paddle_inference_api.cc paddle_inference_api_impl.cc)
+set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
+if(NOT APPLE)
+ set(LINK_FLAGS "-fPIC -fvisibility=hidden")
+ set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
index ea46b3006f8d0964cc8229d3683ee7b602d6ef0d..4fe198ad7d4a752882965e9e7fc460741de53d22 100644
--- a/paddle/contrib/inference/paddle_inference_api.cc
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -23,7 +23,6 @@ int PaddleDtypeSize(PaddleDType dtype) {
case PaddleDType::INT64:
return sizeof(int64_t);
default:
- //
assert(false);
return -1;
}
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
index ba266b608da342fb71faf05d02ddf74330e21e98..f9ec6f55449fc46b4a44b9563980cb5f8e80a951 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -22,9 +22,9 @@
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 88c4e665a3daed0ed34b23b75d360acbd586401f..c3649dcb96c77f449d876bef34c4aea7afb31daa 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -249,7 +249,7 @@ void MainThreadsImageClassification(bool use_gpu) {
const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data();
- EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+ EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3);
}
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6286dda4a54991b7a1042aed9886fdcb694198ba..ec252929d5584c211cea7fa52004ecdfdf586a85 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -21,12 +21,13 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
+cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_test(variable_test SRCS variable_test.cc)
@@ -38,7 +39,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
- DEPS operator op_registry init math_function)
+ DEPS operator op_registry device_context math_function)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
@@ -63,7 +64,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -101,14 +102,14 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
-cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
-cc_test(init_test SRCS init_test.cc DEPS init)
-
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
- channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
- conditional_block_op while_op assign_op print_op executor proto_desc)
+
+# disable test temporarily.
+# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
+# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
+# conditional_block_op while_op assign_op print_op executor proto_desc)
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index a91fe5c99d397ef1bf04f6d22e988b6d3f33e500..f2c55e533a2747325b1b16fdada37945a8ed3c42 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -14,13 +14,13 @@ limitations under the License. */
#include "gtest/gtest.h"
-#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
namespace paddle {
namespace framework {
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -25,11 +25,12 @@ else()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
endif()
+cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
- scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
+ scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 64e83acb4dc1995800c4ca3caf81668b24a7c9fe..b2e5399e2376a86c1cd310b29c768832665af87f 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -33,6 +33,8 @@ struct BuildStrategy {
GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
std::string debug_graphviz_path_{""};
+
+ bool enable_data_balance_{false};
};
} // namespace details
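On the Python side, this flag would be toggled through the build strategy passed to `ParallelExecutor`. A sketch under the assumption that `enable_data_balance_` is exposed to Python as `BuildStrategy.enable_data_balance` (the pybind change is not shown in this diff; `loss` is a placeholder name):

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.enable_data_balance = True  # assumed pybind attribute

pe = fluid.ParallelExecutor(
    use_cuda=True,
    loss_name="loss",
    build_strategy=build_strategy)
```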
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68896c8ac1bae7d4bfcfa79cc8ec5c26bf2d93ee
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include <algorithm>
+#include "paddle/fluid/framework/details/container_cast.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+DataBalanceOpHandle::DataBalanceOpHandle(
+ const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places,
+ const platform::NCCLContextMap *ctxs)
+ : local_scopes_(local_scopes), places_(places) {
+ if (ctxs) {
+ for (auto &p : places_) {
+ this->dev_ctxes_[p] = ctxs->DevCtx(p);
+ }
+ }
+}
+#else
+DataBalanceOpHandle::DataBalanceOpHandle(
+ const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places)
+ : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+std::string DataBalanceOpHandle::Name() const { return "data balance"; }
+
+std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
+ const std::vector<int> &device_sizes) {
+ int device_num = device_sizes.size();
+ int total_size = 0;
+ int empty_num = 0;
+ std::vector<std::array<int, 2>> size_device_vec;
+ size_device_vec.reserve(device_num);
+ for (int i = 0; i < device_num; ++i) {
+ if (device_sizes[i] == 0) {
+ ++empty_num;
+ }
+ total_size += device_sizes[i];
+ size_device_vec.push_back({{device_sizes[i], i}});
+ }
+ std::vector<std::array<int, 3>> res;
+ if (empty_num == 0) {
+ // No need to do data balance.
+ return res;
+ }
+ if (total_size < device_num) {
+ // Not enough data.
+ PADDLE_THROW_EOF();
+ }
+ std::sort(size_device_vec.begin(), size_device_vec.end(),
+ [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
+ return a[0] > b[0];
+ });
+ int expected_device_size = total_size / device_num;
+ int src_idx = 0;
+ for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
+ if (size_device_vec[src_idx][0] <= expected_device_size) {
+ ++src_idx;
+ PADDLE_ENFORCE_LT(
+ src_idx, device_num - empty_num,
+ "In current srategy an empty tensor should not be copy source.");
+ }
+ size_device_vec[src_idx][0] -= expected_device_size;
+ size_device_vec[dst_idx][0] += expected_device_size;
+ res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
+ expected_device_size}});
+ }
+ return res;
+}
+
+void DataBalanceOpHandle::RunImpl() {
+ PADDLE_ENFORCE_GT(places_.size(), 1,
+ "Data balance can only be enabled when the number of "
+ "places to run larger than 1.");
+ auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+ auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+ PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
+ PADDLE_ENFORCE_EQ(
+ in_var_handles.size(), out_var_handles.size(),
+ "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+ int data_num = in_var_handles.size() / places_.size();
+ WaitInputVarGenerated();
+ std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
+ std::vector<int> device_sizes;
+ for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
+ PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+ "The name of input and output should be equal.");
+ int place_idx = i / data_num;
+ int data_idx = i % data_num;
+ auto *local_scope =
+ local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+ auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+ PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
+ auto *tensor = tensor_var->GetMutable<LoDTensor>();
+ lod_tensors[data_idx].push_back(tensor);
+ int ins_size =
+ tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
+ if (data_idx == 0) {
+ device_sizes.emplace_back(ins_size);
+ } else {
+ PADDLE_ENFORCE_EQ(
+ ins_size, device_sizes.at(place_idx),
+ "All data on the same device shall have the same batch size.");
+ }
+ }
+ const auto &balance_plan = GetBalancePlan(device_sizes);
+
+ for (const auto &trans : balance_plan) {
+ for (int data_idx = 0; data_idx < data_num; ++data_idx) {
+ LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
+ LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
+ int trans_ins_size = trans[2];
+ LoD src_lod = src_tensor->lod();
+ int src_ins_size =
+ src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
+ int cut_point = src_ins_size - trans_ins_size;
+ if (!src_lod.empty()) {
+ for (auto &level : src_lod) {
+ cut_point = level[cut_point];
+ }
+ }
+ TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
+ dst_tensor->place(), dst_tensor);
+ src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
+ if (!src_lod.empty()) {
+ dst_tensor->set_lod(SliceInLevel(
+ src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
+ src_tensor->set_lod(
+ SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
+ }
+ }
+ }
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
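The plan computed by `GetBalancePlan` is easier to follow outside C++. A Python re-implementation of the same algorithm, for illustration only (the bounds check the C++ enforces after advancing `src` is omitted, and tie order may differ from `std::sort`):

```python
def get_balance_plan(device_sizes):
    # Move data from the largest devices onto empty ones so that every
    # device ends up with roughly total_size / device_num samples.
    device_num = len(device_sizes)
    total_size = sum(device_sizes)
    empty_num = device_sizes.count(0)
    res = []
    if empty_num == 0:
        return res                      # nothing to balance
    if total_size < device_num:
        raise EOFError("not enough data to give each device one sample")
    pairs = sorted([[s, i] for i, s in enumerate(device_sizes)], reverse=True)
    expected = total_size // device_num
    src = 0
    for dst in range(device_num - empty_num, device_num):
        if pairs[src][0] <= expected:
            src += 1                    # current source is exhausted
        pairs[src][0] -= expected
        pairs[dst][0] += expected
        res.append((pairs[src][1], pairs[dst][1], expected))
    return res

# Device 0 holds 6 samples, device 1 holds 2, devices 2 and 3 are empty.
print(get_balance_plan([6, 2, 0, 0]))  # [(0, 3, 2), (0, 2, 2)]
```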
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..76a407e3610e8bb48facf1f814779f4c23f92d98
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct DataBalanceOpHandle : public OpHandleBase {
+ public:
+#ifdef PADDLE_WITH_CUDA
+ DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places,
+ const platform::NCCLContextMap *ctxs);
+#else
+ DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places);
+#endif
+
+ std::string Name() const override;
+
+ bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+ void RunImpl() override;
+
+ private:
+ // std::vector<(src_dev_id, dst_dev_id, trans_size)>
+ std::vector<std::array<int, 3>> GetBalancePlan(
+ const std::vector<int> &batch_size_per_device);
+
+ const std::vector<Scope *> local_scopes_;
+ const std::vector<platform::Place> places_;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 224e8e1f6efd7a894591ac51c929517cae7539ce..d646c944601e81477787740189d7ac60ae97fa80 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -67,8 +67,8 @@ void FetchOpHandle::RunImpl() {
#endif
} else {
tensors_[i].ShareDataWith(t);
- tensors_[i].set_lod(t.lod());
}
+ tensors_[i].set_lod(t.lod());
}
this->WaitAndMergeCPUTensors();
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index cc7b94d0653e34c8ac711a7db7ab6ab1a9ac46a2..b82c2ef4082110f1621eb38d50361396511a4825 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -58,6 +59,11 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
grad_names_.insert(GradVarName(p));
}
balance_vars_.resize(places_.size(), 0);
+ if (strategy_.enable_data_balance_ && places_.size() == 1) {
+ LOG(WARNING) << "It is no need to enable data balance when there is only "
+ "one place. enable_data_balance is set to False.";
+ strategy_.enable_data_balance_ = false;
+ }
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -215,7 +221,14 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} else {
// This op runs on all devices, and its output may have parameter's
// gradients.
- CreateComputationalOps(&result, *op, places_.size());
+ if (op->Type() == "read" && strategy_.enable_data_balance_) {
+ op->SetAttr("throw_eof_exp", false);
+ CreateComputationalOps(&result, *op, places_.size());
+ const auto &data_var_names = op->Output("Out");
+ InsertDataBalanceOp(&result, data_var_names);
+ } else {
+ CreateComputationalOps(&result, *op, places_.size());
+ }
if (!is_forwarding && places_.size() > 1) {
// Currently, we assume that once gradient is generated, it can be
@@ -360,6 +373,29 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
}
}
+void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
+ SSAGraph *result, const std::vector<std::string> &datas) const {
+#ifdef PADDLE_WITH_CUDA
+ result->ops_.emplace_back(
+ new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+ result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
+#endif
+ auto *op_handle = result->ops_.back().get();
+ for (size_t i = 0; i < places_.size(); ++i) {
+ auto &p = places_[i];
+ SetCommunicationContext(op_handle, p);
+ for (const std::string &d_name : datas) {
+ auto &vars = result->vars_[i][d_name];
+ PADDLE_ENFORCE(!vars.empty());
+ op_handle->AddInput(vars.back().get());
+ auto var = new VarHandle(vars.size(), i, d_name, p);
+ vars.emplace_back(var);
+ op_handle->AddOutput(var);
+ }
+ }
+}
+
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
@@ -512,7 +548,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
// the variable name which contains .block means it was splited by
// split_byref op
- // so that we can balance the variable blocks to all the pserver instances.
+ // so that we can balance the variable blocks to all the pserver
+ // instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 0b6347bf51dc1c347073a0fdcf4ddd91865d846d..a964e024885e56693224a6199e00ff30beaa1df4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
+ void InsertDataBalanceOp(SSAGraph *result,
+ const std::vector<std::string> &datas) const;
+
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const;
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8..d80bdcf15d798925c137460125964d3d7e65f67e 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -58,8 +58,10 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
+ PADDLE_ENFORCE_NOT_NULL(waited_ctx);
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) {
+ PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
dev_ctx.second->Wait();
}
} else {
@@ -122,16 +124,10 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (!events_.empty()) { // Use event
std::function method = callback;
- // NOTE(zcd): device context must be ordered here because RecordEvent
- // will use a mutex to ensure the safe of multi-threads.
- std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
for (auto &p : dev_ctxes_) {
- ordered_ctxes.emplace(p.second, p.first);
- }
- for (auto &p : ordered_ctxes) {
method = [method, p, this]() {
- static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
- events_.at(boost::get<platform::CUDAPlace>(p.second).device),
+ static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
+ events_.at(boost::get<platform::CUDAPlace>(p.first).device),
method);
};
}
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index fbd90a3296bca92b097cab925b218b91e7f4752f..6aec178831161f8ac1306fc3ed72e3267ca3c7e5 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -13,9 +13,9 @@
// limitations under the License.
#pragma once
+#include <map>