diff --git a/.travis.yml b/.travis.yml index a406841f6abf01f15826f34fe4c63b4c24486ccd..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,15 +27,6 @@ script: # 43min timeout paddle/scripts/paddle_docker_build.sh ${JOB} if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; - - | - if [[ "$JOB" != "doc" ]]; then exit 0; fi; - # For document only - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; - export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh - export DOCS_DIR=`pwd` - cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/AUTHORS.md b/AUTHORS.md index 8c4a113fc276783c945867ceae9612339b7f0bbc..41b7193677a0208ba2fa82b72862292572dcb6ef 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -46,6 +46,7 @@ | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 231224f9249848b6e4981a98e0538794bf5d3c08..f56c5d382af8cdfb5a941ee272a0f8d22ec04d67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(WITH_INFERENCE "Compile fluid inference library" ON) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) @@ -72,6 +73,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER if(NOT PY_VERSION) set(PY_VERSION 2.7) endif() +set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -158,6 +160,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags @@ -174,6 +177,7 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) +include(external/cub) if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -200,6 +204,14 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +if(WITH_GPU) + include(cuda) + include(tensorrt) + include(external/anakin) +else() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) +endif() + include(cudnn) # set cudnn libraries, must before configure include(cupti) include(configure) # add paddle env configuration @@ -228,14 +240,6 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) -if(WITH_GPU) - include(cuda) - include(tensorrt) - include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." 
FORCE) -endif() - if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index f8aed5a5e06c5e29dbdfb5db9f2ea0344c7eed6d..6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args): trainer_id, pservers=pserver_endpoints, trainers=trainers, - sync_mode=not args.async_mode, - slice_var_up=not args.no_split_var) + sync_mode=not args.async_mode) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 2c84061ff572de4687b4d496f8ded6deee8d1011..9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index fb3d8ef8d53436f387acc3069a0eb887e6f07c59..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") set(ANAKIN_COMPILE_EXTRA_FLAGS + -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable -Wno-error=unused-variable -Wno-unused-variable -Wno-error=format-extra-args -Wno-format-extra-args -Wno-error=comment -Wno-comment @@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-reorder -Wno-error=cpp) -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") +set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") # A helper function used in Anakin, currently, to use it, one need to recursively include # nearly all the header files. 
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()
 
 if (WITH_ANAKIN)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c94849cf4b96746e6c507db2a6310c2f305dacf5
--- /dev/null
+++ b/cmake/external/cub.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_cub
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+  GIT_TAG        "v1.8.0"
+  PREFIX         ${CUB_SOURCE_DIR}
+  UPDATE_COMMAND    ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(cub STATIC ${dummyfile})
+else()
+  add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 82437a84248fece843c3659c9422d9b579b5066f..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -50,7 +50,7 @@ ExternalProject_Add(
     UPDATE_COMMAND    ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE   1
-    PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch
+    PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
     # NOTE(yuyang18):
     # Disable -Werror, otherwise the compile will fail in MacOS.
     # It seems that we cannot configure that by make command.
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 20dda35c5ccd98f5672d867c26ab97a215483543..260985cc8aa4ad0f231798666c048703b64c6d15 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 
 IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
+    MESSAGE(WARNING
         "Windows or Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF") SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) @@ -57,8 +57,10 @@ ExternalProject_Add( GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake new file mode 100644 index 0000000000000000000000000000000000000000..384c2f9328296ce6a8a6293be6cc47e5063dd3c4 --- /dev/null +++ b/cmake/external/xbyak.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(WITH_XBYAK ON) +if(WIN32 OR APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) + return() +endif() + +include(ExternalProject) + +set(XBYAK_PROJECT extern_xbyak) +set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) +set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) +set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) + +include_directories(${XBYAK_INC_DIR}) +include_directories(${XBYAK_INC_DIR}/xbyak) + +add_definitions(-DPADDLE_WITH_XBYAK) + +# xbyak options +add_definitions(-DXBYAK64) +add_definitions(-DXBYAK_NO_OP_NAMES) + +ExternalProject_Add( + ${XBYAK_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS "" + GIT_REPOSITORY "https://github.com/herumi/xbyak.git" + GIT_TAG "v5.661" # Jul 26th + PREFIX ${XBYAK_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";") + add_library(xbyak STATIC ${dummyfile}) +else() + add_library(xbyak INTERFACE) +endif() + +add_dependencies(xbyak ${XBYAK_PROJECT}) +list(APPEND external_project_dependencies xbyak) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index eafb11b6f21e226fc68556a78d675dea94080140..82c958073cba92f00a341121e36ba45531b22aec 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -263,8 +263,11 @@ function(cc_test TARGET_NAME) COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(cc_test) 
@@ -328,8 +331,11 @@ function(nv_test TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(nv_test) @@ -577,7 +583,9 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index e2c58cd56055455e7fedc598ca8f56183d4b51dc..aeb081e76e5bc5b9d3d81ce625195c800174ab6c 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -148,18 +148,11 @@ if (WITH_ANAKIN AND WITH_GPU) list(APPEND inference_deps anakin_inference_lib) endif() -copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared - SRCS ${src_dir}/${module}/paddle_inference_api.h - ${src_dir}/${module}/demo_ci - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api* - DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference -) -list(APPEND inference_deps inference_api_lib) - set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - DSTS ${dst_dir}/${module} ${dst_dir}/${module} + ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md deleted file mode 100644 index a141dcbca584c6064c8da863410692a8be911d12..0000000000000000000000000000000000000000 --- a/doc/fluid/design/ir/draft.md +++ /dev/null @@ -1,89 +0,0 @@ -## Motivation - -There is a ```gap``` between the ```Program``` defined by -user and the ```Executable``` that can be scheduled -efficiently on heterogeneous hardware, either locally -or distributedly. - -Usually, the ```gap``` is bridged by - -* A serious transformations with defined order. - -* These transformations usually involve -```insert, delete, clustering, split, dependency analysis```. - -* Has a simple way to verify and debug each transformation. - -* Flexible to add, remove or customize transformations to fit -the requirements of various algorithms (models) and hardware secenarios. - -Some other events also push us to a better unified pattern. - -* The deep learning framework is built around the concepts of graphs. -To leverage tools such as compilation (e.g. TVM and nGraph) or -cross-framework conversion (e.g. ONNX), we also need a intermediate -representation that can be connected to the rest of the ecosystem. 
-
-We need a unified pattern to naturally support the requirements
-described above. The pattern should fit both training, inference
-and other offline serielized model transformations.
-Learned from LLVM and other deep learning framework, we draft the
-design below.
-
-
-## Design
-
-### Major Concepts
-
-#### Node
-
-```Node``` represents an operation that performs some computation or
-a variable that is input or output of operation.
-
-```Node```s are connected to other ```Node```s via inputs and outputs.
-
-Other properties (maybe device placement information) can be added
-to ```Node``` in the future if it's a
-common requirement of many other ```Pass```es. Otherwise, it should live
-in a ```Node``` wrapper class that is private to some ```Pass``` or be
-a local member of a ```Pass```.
-
-#### Graph
-
-```Graph``` contains a list of ```Node```s, which are connected to
-each other via inputs and outputs.
-
-TODO: Better definitions for the graph.
-
-```Graph``` can also contain ```Attribute```s. ```Attribute```s
-can be ``any`` thing. For example, it can be a list of "wraper"
-nodes. The ```wrapper``` nodes compose ```Node```s and provide
-helper method for execution or transformation. ```Attribute```
-can also contain other things that describe some properties of
-the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
-across ```Pass```. However, it should be used with care.
-
-#### Pass
-
-```Pass``` represents a transformation of ```Graph```. Its input
-is a ```Graph``` and its output is also a ```Graph```. For example,
-a ```Pass``` can simply print out the ```Graph```. A ```Pass```
-can also fuse some ```Graph```'s ```Node```s.
-
-#### Optimize
-
-```Optimize``` contains a series of ```Pass``` with defined order.
-```Optimize``` transforms a ```Graph``` that only contains raw
-modeling logic to a ```Graph``` that can be run efficiently while
-maintaining the original modeling logic.
-
-
-### Optimize Process
-
-* Program is first converted to Graph.
-* Graph goes through a series of Pass
-* Graph is transformed from raw model logic to a
-form that is efficient to execute.
-
-Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e
--- /dev/null
+++ b/doc/fluid/design/ir/overview.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by the
+user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or in a distributed setting.
+
+Usually, the `gap` is bridged by
+
+* A series of transformations with a defined order.
+
+* These transformations usually involve
+`insert, delete, clustering, split, dependency analysis`.
+
+* A simple way to verify and debug each transformation.
+
+* Flexibility to add, remove, or customize transformations to fit
+the requirements of various algorithms (models) and hardware scenarios.
+
+Some other developments also push us toward a better unified pattern.
+
+* The deep learning framework is built around the concept of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need an intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training and inference,
+as well as other offline serialized model transformations.
+Learning from LLVM and other deep learning frameworks, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is an input or output of an operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wrapper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper methods for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`es. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc);
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc);
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar();
+
+  // A more free style way of creating a graph node. Mostly used for tests
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+    // Some correctness check.
+    auto new_graph = ApplyImpl(std::move(graph));
+    // Some correctness check.
+    return new_graph;
+  }
+
+  // Get a reference to the attribute previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
+    // do something.
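+    // (Illustrative sketch, not part of the original doc:) a pass body
+    // typically walks the graph via the `Graph` API above and mutates it,
+    // e.g.:
+    //   for (ir::Node *node : graph->Nodes()) { /* inspect or rewrite */ }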
+    return graph;
+  }
+};
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass`es with a defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic into a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Passes.
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```cpp
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(graph));
+// For a more complex Pass, the Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+int optimize_level = 1;
+mem_opt_pass->SetNotOwned("optimize_level", &optimize_level);
+mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(graph));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
 # 如何使用timeline工具做性能分析
 
-1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
 
 	**提示:**
 	请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[])
+	for pass_id in range(pass_num):
+	    for batch_id, data in enumerate(train_reader()):
+	        if pass_id == 0 and batch_id == 5:
+	            profiler.start_profiler("All")
+	        elif pass_id == 0 and batch_id == 10:
+	            profiler.stop_profiler("total", "/tmp/profile")
+	        exe.run(fluid.default_main_program(),
+	                feed=feeder.feed(data),
+	                fetch_list=[])
 	            ...
 	```
 
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
 
1. 打开chrome浏览器,访问 <chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, as the profile record will grow with the number of batches.
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[],
-	                    use_program_cache=True)
+	for pass_id in range(pass_num):
+	    for batch_id, data in enumerate(train_reader()):
+	        if pass_id == 0 and batch_id == 5:
+	            profiler.start_profiler("All")
+	        elif pass_id == 0 and batch_id == 10:
+	            profiler.stop_profiler("total", "/tmp/profile")
+	        exe.run(fluid.default_main_program(),
+	                feed=feeder.feed(data),
+	                fetch_list=[])
 	            ...
 	```
 
@@ -17,6 +19,10 @@
 file `/tmp/timeline` by default. You can change the path by cmd parameter,
 please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)
 for details.
 
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 ![chrome tracing](./tracing.jpeg)
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion
+Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.
+
+There are generally two ways to fuse operators: fusing directly connected operators and fusing non-directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operators into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.
+
+## Challenge
+The challenges of fusing operators are:
+  - how to make the rules;
+  - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of the DL model, we found there are two groups of operators that can be fused explicitly: one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is the operators that have the same function, for example, a series of `SGD` or `Momentum` operators. They usually appear in the model in large numbers. So we should first think about how to fuse them separately.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and it can only express some simple operations. So, taking into account our current needs, the template function is more appropriate.
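+
+As a minimal sketch (an illustration added for this survey, not a settled design), such a template function might look like the following, assuming a pair of element-wise functors applied while the data is still in registers:
+
+```cpp
+// Hypothetical fused helper: applies op1 and then op2 per element, so the
+// intermediate result (e.g. tmp = x + y before z = Relu(tmp)) never has to
+// be written back to global memory.
+template <typename T, typename Op1, typename Op2>
+void FusedElementwise(const T *x, const T *y, T *z, int n, Op1 op1, Op2 op2) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = op2(op1(x[i], y[i]));
+  }
+}
+```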
+
+#### How to fuse the operators that have the same function efficiently?
+We take the SGD operator as an example: the training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor will execute this expression hundreds of times on the CPU or other specialized accelerators. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute once. For some accelerators, the time of launching a kernel is not negligible, so hundreds of kernel launches and executions may take much longer than launching and executing only once. There are usually many operators similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6efb03dabe89b28f3ff1a55c4a940dfe74e8001d..dd172ff9c97814c089ddb2e5bf729880cf0c9cdb 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -170,6 +170,7 @@ paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], var
 paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -201,7 +202,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -225,17 +225,14 @@ paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=
 paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None,
defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -266,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -309,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', ' paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, 
defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -339,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d274d96c29bdbf5973d568d783369c3975bdc436..2577e59d9cf24c26b7c04aa00cdde6cde17f7206 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -5,5 +5,7 @@ add_subdirectory(operators) add_subdirectory(pybind) add_subdirectory(string) add_subdirectory(recordio) -# NOTE: please add subdirectory inference at last. -add_subdirectory(inference) +if(WITH_INFERENCE) + # NOTE: please add subdirectory inference at last. 
+ add_subdirectory(inference) +endif() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de06c860f550641a58a32d49e85feb7278fed1dd..1d62792b80dd002b894da28be9162fc7d3ce054e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,10 +7,11 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) +cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -22,7 +23,12 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor) +if(WITH_GPU) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) +else() + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +endif() + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) @@ -94,7 +100,7 @@ else() endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) @@ -105,7 +111,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) - + # cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index ce48548418478cc5c9f9ca1244df9e66dca884e6..960ca39e1eadd3c064beb0e2c1342a406c4f0b6a 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -88,9 +88,8 @@ class BlockDesc { OpDesc *InsertOp(size_t index); /* - * Remove Op and its input/output variables. - * Note that for either input or output variable, if it is also an input or - * output variable of other ops, we should remain it. 
+   * Only remove the op itself;
+   * do nothing to its input and output variables.
    */
   void RemoveOp(size_t s, size_t e);
 
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 60382faffb8e53870658b2d1ff83abc4008cb4cf..1a9ce746ea840bc088d222cc4e9bc05159d64734 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -17,6 +17,8 @@
 #include <stdint.h>
 #include <string>
 
+using float16 = paddle::platform::float16;
+
 namespace paddle {
 namespace framework {
 
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
   RegisterType(retv, proto_type, #cc_type)
 
   // NOTE: Add your customize type here.
-  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float16, proto::VarType::FP16);
   RegType(float, proto::VarType::FP32);
   RegType(double, proto::VarType::FP64);
   RegType(int, proto::VarType::INT32);
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437
--- /dev/null
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/data_type.h"
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(DataType, float16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::float16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
+
+  // test fp16 tensor
+  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
+
+  // test fp16 size
+  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
+
+  // test debug info
+  std::string type = "float16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index df55b3d05402f1aeecfd8d4218a637a81d58ed87..8f6c4163d6ee11fbe83f603f6148c2ac6175324d 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,13 +1,13 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
-cc_library(ssa_graph_builder SRCS
ssa_graph_builder.cc DEPS graph) -cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) -cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) +cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) +cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) +cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -28,12 +28,9 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle +cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) - -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) - cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { + platform::RecordEvent r("all_reduce", nullptr); if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b2e5399e2376a86c1cd310b29c768832665af87f..8714a42162bda3d5ad12e7925fe8cc4e693f51b1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,6 +21,26 @@ namespace framework { namespace details { struct BuildStrategy { + // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and + // kReduce, for CPU and GPU. If you use kAllReduce, different threads + // optimize their parameters separately. If you use kReduce, the optimizations + // of parameters are distributed to different threads. + // For example, a model has 100 parameters and is running with four threads, + // if you choose kAllReduce, every thread is to optimize 100 parameters + // separately, if you choose kReduce, every thread is to optimize 25 + // parameters. 
+  // Of particular note, if you use kReduce during CPU training, all of the
+  // parameters are shared between different threads. This feature will
+  // save memory.
+  // FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may not
+  // be equal for GPU, because summing in a different order may produce a
+  // different result; for example, the result of `a+b+c+d` may be different
+  // from the result of `c+a+b+d`.
+  // For GPU, the implementations of kAllReduce and kReduce are based on NCCL,
+  // so the results of kAllReduce and kReduce may not be equal.
+  // For CPU, if you want to fix the order of summing so that the results
+  // of kAllReduce and kReduce are identical, you can add
+  // `FLAGS_cpu_deterministic=true` to the environment.
   enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
 
   enum class GradientScaleStrategy {
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ExceptionHolder {
+ public:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
+  bool ExceptionCatched() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return exception_.get() != nullptr;
+  }
+
+  void Throw() {
+    std::lock_guard<std::mutex> lock(mu_);
+    switch (type_) {
+      case kNone:
+        break;
+      case kEnforceNotMet: {
+        auto e = *static_cast<platform::EnforceNotMet *>(exception_.get());
+        throw e;
+        break;
+      }
+      case kEOF: {
+        auto e = *static_cast<platform::EOFException *>(exception_.get());
+        throw e;
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown exception.";
+    }
+    exception_.reset();
+    type_ = kNone;
+  }
+
+  void Clear() {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset();
+    type_ = kNone;
+  }
+
+ private:
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  ExceptionType type_{kNone};
+
+  std::unique_ptr<std::exception> exception_;
+  mutable std::mutex mu_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
similarity index 75%
rename from paddle/fluid/framework/details/ssa_graph_checker.cc
rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
index 7c79d7f1e881c67514634d56caa715c41927dbce..c9c255864a2477ed29873f8521acce37fa928c06 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include <string>
 
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   std::unordered_set<VarHandleBase *> ready_vars;
@@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
     }
   };
 
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
     }
   }
 
-  for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
+  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
     insert_pending_var(var.get());
   }
 
-  for (auto &op : graph->Get<GraphOps>("ops")) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (op->Inputs().empty()) {
       ready_ops.insert(op.get());
     } else {
@@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_devices_check_pass,
+              paddle::framework::details::SSAGraghBuilderWithChecker)
+    .RequireGraphAttr(paddle::framework::details::kGraphVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphOps)
+    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
similarity index 53%
rename from paddle/fluid/framework/details/ssa_graph_checker.h
rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.h
index 2e397e86825a41765a360d31fa8595d17027f3ec..1e2b1867c376956d7d2dac465c13e2f3f64ba7eb 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 
 #include <string>
 
@@ -22,26 +22,15 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
- public:
-  explicit SSAGraghBuilderWithChecker(
-      std::unique_ptr<SSAGraphBuilder>&& builder)
-      : builder_(std::move(builder)) {}
-
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
-    auto new_graph = builder_->Apply(std::move(graph));
-    PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
-    return new_graph;
-  }
-
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
+class SSAGraghBuilderWithChecker : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
   }
 
-  bool IsValidGraph(const Graph* graph) const;
-
- private:
-  std::unique_ptr<SSAGraphBuilder> builder_;
+  bool IsValidGraph(const ir::Graph* graph) const;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
similarity index 61%
rename from paddle/fluid/framework/details/multi_devices_graph_builder.cc
rename to paddle/fluid/framework/details/multi_devices_graph_pass.cc
index f1f8674caf663ce38df5a2eecbcf690b5ca87dc4..c5a13e7e1f45e1eb9b4271880630c52d30022f4b 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -21,10 +21,11 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,31 +33,109 @@
 namespace paddle {
 namespace framework {
 namespace details {
+namespace {
+void PolishGraphToSupportDataHazards(ir::Graph *graph) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      if (name_pair.second.size() <= 1) {
+        continue;
+      }
+      auto it_new = name_pair.second.rbegin();
+      auto it_old = name_pair.second.rbegin();
+      ++it_old;
+      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
+        OpHandleBase *write_op = (*it_new)->GeneratedOp();
+        const auto &read_ops = (*it_old)->PendingOps();
+
+        for (auto *read_op : read_ops) {
+          // Manually add a dependency var from read_op to write_op;
+          if (read_op == write_op) {
+            // Read Write is the same op.
+            continue;
+          }
+          bool has_dep = false;
+          for (auto *r_out : read_op->Outputs()) {
+            for (auto *w_in : write_op->Inputs()) {
+              if (r_out->Node() == w_in->Node()) {
+                has_dep = true;
+                break;
+              }
+            }
+          }
+          if (has_dep) continue;
+
+          auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+          read_op->AddOutput(dep_var);
+          write_op->AddInput(dep_var);
+          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+        }
+      }
+    }
+  }
+}
+
+VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
+                                      const platform::Place &place,
+                                      size_t place_offset) {
+  auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
+  auto &var_holder = var_holders[node->Name()];
+  VarHandle *var = nullptr;
+  if (var_holder.empty()) {
+    if (node->Var()) {
+      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
+                          node->Name(), place);
+    } else {
+      var = new VarHandle(
+          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
+          place_offset, node->Name(), place);
+    }
+    var_holder.emplace_back(var);
+  } else {
+    var = var_holder.rbegin()->get();
+  }
+  return var;
+}
+
+void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
+                    ir::Node *new_node, const platform::Place &place,
+                    size_t place_offset) {
+  auto &vars =
+      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
+  size_t version = vars.size();
+  auto var =
+      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
+}
+
+void AddOutputToLeafOps(ir::Graph *graph) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+    if (!op->Outputs().empty()) {
+      continue;
+    }
+    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
+    op->AddOutput(dummy_leaf);
+  }
+}
+}  // namespace
+
+static const char kLossVarName[] = "loss_var_name";
+static const char kPlaces[] = "places";
+static const char kParams[] = "params";
+static const char kLocalScopes[] = "local_scopes";
+static const char kStrategy[] = "strategy";
+
+void MultiDevSSAGraphBuilder::Init() const {
+  loss_var_name_ = Get<const std::string>(kLossVarName);
+  places_ = Get<const std::vector<platform::Place>>(kPlaces);
+  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
+  strategy_ = Get<const BuildStrategy>(kStrategy);
 #ifdef PADDLE_WITH_CUDA
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs),
-      strategy_(strategy) {
-#else
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      strategy_(strategy) {
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
-  for (auto &p : params) {
+
+  for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
@@ -67,10 +146,11 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
+                                                ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
@@ -92,12 +172,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> send_vars;
   // since parameters are all in block 0,
   // it's enough to only scan send ops in block 0
   for (auto &node : nodes) {
-    if (node->NodeType() != ir::Node::Type::kOperation) continue;
     OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find send op,
     // instead of the the hard code string
@@ -112,10 +191,9 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> recv_vars;
   for (auto &node : nodes) {
-    if (node->NodeType() != ir::Node::Type::kOperation) continue;
     OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find recv op,
     // instead of the hard code string
@@ -170,6 +248,7 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
     const std::vector<std::string> &var_names) const {
   int64_t numel_sum = 0;
   for (auto var_name : var_names) {
+    if (all_vars_.find(var_name) == all_vars_.end()) continue;
     auto var_desc = all_vars_.at(var_name);
     PADDLE_ENFORCE_NOT_NULL(var_desc);
     auto dim = framework::make_ddim(var_desc->GetShape());
@@ -186,30 +265,83 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
   return dev_id;
 }
 
-std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
-    std::unique_ptr<Graph> graph) const {
-  // Rebuild the graph structure.
-  auto nodes = std::move(graph->nodes);
-  graph->nodes.clear();
+// Topologically sort the graph nodes from inputs to outputs.
+// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
+// to parameters/gradients before optimizer ops, topo sort alone is
+// insufficient (some optimizer ops might not depend on any nodes), so we
+// manually move all optimizer nodes after the last backward nodes.
+// However, the assumption made by SSAGraphBuilder should be relaxed in the
+// future.
+std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
+  std::vector<ir::Node *> ret = ir::TopologySortOperations(graph);
+  size_t last_backward = 0;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (boost::get<int>(
+            ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = i;
+    }
+  }
+
+  std::vector<ir::Node *> optimize_ops;
+  std::vector<ir::Node *> sorted_ret;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (i < last_backward) {
+      if (boost::get<int>(ret[i]->Op()->GetAttr(
+              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+          static_cast<int>(OpRole::kOptimize)) {
+        optimize_ops.push_back(ret[i]);
+      } else {
+        sorted_ret.push_back(ret[i]);
+      }
+    } else if (i == last_backward) {
+      sorted_ret.push_back(ret[i]);
+      // Verify that no operations before optimize ops depend on optimize ops.
+ std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (ir::Node *n : sorted_ret) { + for (ir::Node *in : n->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), n->Name()); + } + } + } + sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), + optimize_ops.end()); + } else { + sorted_ret.push_back(ret[i]); + } + } + return sorted_ret; +} + +std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( + std::unique_ptr graph) const { + Init(); + // Give the topology sort order and rebuild the graph structure. + std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + auto nodes = graph->ReleaseNodes(); + ir::Graph &result = *graph; for (auto &node : nodes) { - if (node->NodeType() == ir::Node::Type::kVariable) { + if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } - - Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 - result.Set("vars", new GraphVars(places_.size())); - result.Set("dep_vars", new GraphDepVars); - result.Set("ops", new GraphOps); + result.Set(kGraphVars, new GraphVars(places_.size())); + result.Set(kGraphDepVars, new GraphDepVars); + result.Set(kGraphOps, new GraphOps); + result.Set(kShardedVarDevice, new ShardedVarDevice); // find send/recv vars so that we can place the distributed training - // realted op in the place 0 - auto send_vars = FindDistTrainSendVars(nodes); - auto recv_vars = FindDistTrainRecvVars(nodes); + // related op in the place 0 + auto send_vars = FindDistTrainSendVars(sorted_ops); + auto recv_vars = FindDistTrainRecvVars(sorted_ops); std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); @@ -217,23 +349,20 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( size_t cur_device_id = 0; bool is_forwarding = true; - // NOTE: Currently, passes before SSAGraphBuilder cannot reorder - // forward, backward nodes. E.g. you can't append an forward node - // at the end of the node list. - // TODO(panyx0718): FIXME: Needs to sort by forward->backward order. - for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; + for (ir::Node *node : sorted_ops) { if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node.get()); - } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) { - CreateDistTrainOp(&result, node.get()); - } else if (IsScaleLossOp(node.get())) { + CreateRPCOp(&result, node); + } else if (IsDistTrainOp(node, send_vars, recv_vars)) { + CreateDistTrainOp(&result, node); + } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { - CreateScaleLossGradOp(&result); + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + CreateScaleLossGradOp(&result, loss_grad_name); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -241,24 +370,24 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // the block. 
is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(node.get()); + int op_dev_id = GetOpDeviceID(result, node); if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node.get(), op_dev_id); + CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - var_name_on_devices_.emplace(n->Name(), op_dev_id); + graph->Get(kShardedVarDevice) + .emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's // gradients. + // TODO(paddle-dev): Why is so special about "read" op? if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node.get(), places_.size()); - // TODO(paddle-dev): builder shouldn't depend on the out logic of - // a specific op. + CreateComputationalOps(&result, node, places_.size()); const auto &data_var_names = node->Op()->Output("Out"); InsertDataBalanceOp(&result, data_var_names); } else { - CreateComputationalOps(&result, node.get(), places_.size()); + CreateComputationalOps(&result, node, places_.size()); } if (!is_forwarding && places_.size() > 1) { @@ -283,7 +412,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); + graph->Get(kShardedVarDevice) + .emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: @@ -322,17 +452,17 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } } } - /* - Dependency graph has been constructed. However, there are still data - hazards need to be handled. - */ + Dependency graph has been constructed. However, there are still data + hazards need to be handled. + */ PolishGraphToSupportDataHazards(&result); /* * Only variables should be the leaves of graph. 
*/ AddOutputToLeafOps(&result); + PADDLE_ENFORCE(!ir::HasCircle(result)); return graph; } @@ -357,7 +487,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { #ifdef PADDLE_WITH_CUDA @@ -369,16 +499,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_); #endif - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); auto *in = - result->Get("vars").at(src_dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars").at(i).at(p_name); + auto &vars = result->Get(kGraphVars).at(i).at(p_name); auto *out_var = new VarHandle( result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), i, p_name, p); @@ -387,32 +517,32 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id])); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, +void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -426,22 +556,22 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, } void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - Graph *result, const std::vector &datas) const { + ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < 
places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { - auto &vars = result->Get("vars")[i][d_name]; + auto &vars = result->Get(kGraphVars)[i][d_name]; PADDLE_ENFORCE(!vars.empty()); op_handle->AddInput(vars.back().get()); auto var = new VarHandle( @@ -465,7 +595,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, + ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } @@ -478,18 +609,21 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1]); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", - node->Op()->Type(), param_grad[0]); + int dev_id = GetVarDeviceID(graph, param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } -int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { - auto got = var_name_on_devices_.find(varname); - return got == var_name_on_devices_.end() ? -1 : got->second; +int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, + const std::string &varname) const { + auto &sharded_var_device = graph.Get(kShardedVarDevice); + auto got = sharded_var_device.find(varname); + return got == sharded_var_device.end() ? -1 : got->second; } -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( + ir::Graph *result, const std::string &loss_grad_name) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA @@ -504,7 +638,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), local_scopes_.size(), local_scopes_[i], places_[i], communication_dev_ctx); - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. 
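The device-placement map formerly held in the builder (var_name_on_devices_) now travels with the graph as the kShardedVarDevice attribute, so GetVarDeviceID above reduces to an attribute lookup. An equivalent standalone sketch (LookupVarDevice is a hypothetical name):

    // Returns the device a sharded variable was pinned to, or -1 if unpinned.
    int LookupVarDevice(const ir::Graph &graph, const std::string &name) {
      const auto &sharded = graph.Get<ShardedVarDevice>(kShardedVarDevice);
      auto it = sharded.find(name);
      return it == sharded.end() ? -1 : it->second;
    }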
@@ -512,48 +646,48 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - CreateOpOutput(result, op_handle, - result->CreateEmptyNode(GradVarName(loss_var_name_), - ir::Node::Type::kVariable), - places_[i], i); + CreateOpOutput( + result, op_handle, + result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable), + places_[i], i); } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); CreateOpHandleIOs(result, node, scope_idx); } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } - auto &vars = result->Get("vars")[dst_dev_id][og]; + auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), dst_dev_id, og, places_[dst_dev_id]); @@ -564,20 +698,19 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, // Find the first occurence of `prev_op_name` and make current `op` depend // on it. -void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, +void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { - for (auto &prev_op : result->Get("ops")) { + for (auto &prev_op : result->Get(kGraphOps)) { if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle( - result->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); - result->Get("dep_vars").emplace(dep_var); + result->Get(kGraphDepVars).emplace(dep_var); op->AddInput(dep_var); } } } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; @@ -591,20 +724,24 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { - op_dev_id = GetVarDeviceID(input_var_names[0]); + // TODO(paddle-dev): getting the first var is not safe. 
+ op_dev_id = GetVarDeviceID(*result, input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { PADDLE_ENFORCE( @@ -618,16 +755,20 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, CreateComputationalOp(result, node, op_dev_id); if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "fetch_barrier"); } } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { +void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { - op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); + PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), + "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op // so that we can balance the variable blocks to all the pserver @@ -640,7 +781,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { } op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } } else if (node->Op()->Type() == "recv") { @@ -650,7 +792,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { } op_dev_id = GetAppropriateDeviceID(output_var_names); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { // send_barrier and fetch_barrier op can be scheduled on device 0 @@ -660,17 +803,18 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get("ops").emplace_back(new RPCOpHandle( + result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); + // TODO(panyx0718): This might not be needed anymore. 
if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "send"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "send_barrier"); } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "recv"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); } else if (node->Op()->Type() == "send") { // do nothing } else { @@ -692,3 +836,11 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_devices_pass, + paddle::framework::details::MultiDevSSAGraphBuilder) + .RequirePassAttr(paddle::framework::details::kLossVarName) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kParams) + .RequirePassAttr(paddle::framework::details::kLocalScopes) + .RequirePassAttr(paddle::framework::details::kStrategy); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h similarity index 50% rename from paddle/fluid/framework/details/multi_devices_graph_builder.h rename to paddle/fluid/framework/details/multi_devices_graph_pass.h index 2b7f4f586b4e750fde9245286c977258a9db6086..7a6f238f9cf7af18cb10ea271e453fec1902c833 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -18,7 +18,7 @@ #include #include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" namespace paddle { @@ -30,42 +30,32 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: -#ifdef PADDLE_WITH_CUDA - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - const BuildStrategy &strategy); -#else - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - const BuildStrategy &strategy); -#endif - std::unique_ptr Apply(std::unique_ptr graph) const override; - int GetVarDeviceID(const std::string &varname) const override; +class MultiDevSSAGraphBuilder : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; private: - void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; + void Init() const; private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - std::unordered_set grad_names_; + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; #ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nccl_ctxs_; + mutable platform::NCCLContextMap *nccl_ctxs_; #endif + int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const; + bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(Graph 
*result, ir::Node *node) const; - void CreateDistTrainOp(Graph *result, ir::Node *node) const; + void CreateRPCOp(ir::Graph *result, ir::Node *node) const; + void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. @@ -74,34 +64,37 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &recv_vars) const; std::vector FindDistTrainSendVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; std::vector FindDistTrainRecvVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; - void ConnectOp(Graph *result, OpHandleBase *op, + void ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const; - void CreateComputationalOps(Graph *result, ir::Node *node, + void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; - void CreateScaleLossGradOp(Graph *result) const; - VarHandle *CreateReduceOp(Graph *result, const std::string &og, + void CreateScaleLossGradOp(ir::Graph *result, + const std::string &loss_grad_name) const; + + VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; - void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, + int dev_id) const; bool IsParameterGradientOnce( const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(ir::Node *node) const; + int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; - void InsertAllReduceOp(Graph *result, const std::string &og) const; + void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; - void InsertDataBalanceOp(Graph *result, + void InsertDataBalanceOp(ir::Graph *result, const std::vector &datas) const; - void CreateBroadcastOp(Graph *result, const std::string &p_name, + void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; bool IsSparseGradient(const std::string &og) const; @@ -110,9 +103,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &var_names) const; private: - BuildStrategy strategy_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; - mutable std::unordered_map var_name_on_devices_; mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc similarity index 82% rename from paddle/fluid/framework/details/ssa_graph_printer.cc rename to paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 6dd6fd262e35a192ba85eb3aa16660526d2ebca2..69944a42b688a9ea5ff29f75f18dd4b156848a27 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
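Since MultiDevSSAGraphBuilder is now a registered pass, a caller such as ParallelExecutor is expected to fetch it from the registry and attach every attribute named in the RequirePassAttr list before applying it. A rough sketch, assuming PassRegistry::Instance().Get and Pass::SetNotOwned from ir/pass.h; the attribute strings match the constants defined in multi_devices_graph_pass.cc:

    auto pass = ir::PassRegistry::Instance().Get("multi_devices_pass");
    pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
    pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
    pass->SetNotOwned<const std::unordered_set<std::string>>("params", &params);
    pass->SetNotOwned<const std::vector<Scope *>>("local_scopes", &local_scopes);
    pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
    graph = pass->Apply(std::move(graph));  // graph now carries kGraphVars etc.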
-#include "paddle/fluid/framework/details/ssa_graph_printer.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include #include "paddle/fluid/framework/ir/graph.h" @@ -21,8 +21,8 @@ namespace framework { namespace details { template -static inline void IterAllVar(const Graph &graph, Callback callback) { - for (auto &each : graph.Get("vars")) { +static inline void IterAllVar(const ir::Graph &graph, Callback callback) { + for (auto &each : graph.Get(kGraphVars)) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,12 +30,12 @@ static inline void IterAllVar(const Graph &graph, Callback callback) { } } - for (auto &var : graph.Get("dep_vars")) { + for (auto &var : graph.Get(kGraphDepVars)) { callback(*var); } } -void GraphvizSSAGraphPrinter::Print(const Graph &graph, +void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, std::ostream &sout) const { size_t var_id = 0; std::unordered_map vars; @@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const Graph &graph, }); size_t op_id = 0; - for (auto &op : graph.Get("ops")) { + for (auto &op : graph.Get(kGraphOps)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; @@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const Graph &graph, } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_devices_print_pass, + paddle::framework::details::SSAGraghBuilderWithPrinter); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c00685fa1629c0722c315c726053c2cba8bf17e7 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class SSAGraphPrinter { + public: + virtual ~SSAGraphPrinter() {} + virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0; +}; + +class GraphvizSSAGraphPrinter : public SSAGraphPrinter { + public: + void Print(const ir::Graph& graph, std::ostream& sout) const override; +}; + +class SSAGraghBuilderWithPrinter : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + std::unique_ptr fout( + new std::ofstream(Get("debug_graphviz_path"))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*graph, *fout); + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..0242274a16c50508f2c0294264c175515c7293ef --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_helper.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/multi_devices_helper.h similarity index 61% rename from paddle/fluid/framework/details/ssa_graph_builder.h rename to paddle/fluid/framework/details/multi_devices_helper.h index e8e8acdb38f893302fb92c47d6f1cb2d38453e0f..175c5a9950be69d7bf6ae9e386af762007a18a51 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -39,45 +39,19 @@ namespace details { typedef std::vector< std::unordered_map>>> GraphVars; +const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. typedef std::unordered_set> GraphDepVars; +const char kGraphDepVars[] = "dep_vars"; // all operators. NOTE that even we use a vector here, the operators is // unordered. typedef std::vector> GraphOps; +const char kGraphOps[] = "ops"; -class SSAGraphBuilder : public ir::Pass { - public: - SSAGraphBuilder() {} - virtual ~SSAGraphBuilder() {} - - virtual int GetVarDeviceID(const std::string &var_name) const = 0; - - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); - - protected: - /** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. 
- * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ - static void PolishGraphToSupportDataHazards(Graph *graph); - - static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node, - const platform::Place &place, - size_t place_offset); - - // Add an output variable (each_var_name, place, place_offset) to op_handle, - // which belongs to graph - static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle, - ir::Node *new_node, const platform::Place &place, - size_t place_offset); - - static void AddOutputToLeafOps(Graph *graph); -}; +typedef std::unordered_map ShardedVarDevice; +const char kShardedVarDevice[] = "sharded_var_device"; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7160e346dad0615e2fd32b70c096880af0359e1a..6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,12 +16,18 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool( + cpu_deterministic, false, + "Whether to make the result of computation deterministic in CPU side."); namespace paddle { namespace framework { namespace details { void ReduceOpHandle::RunImpl() { + platform::RecordEvent r("reduce", nullptr); if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); @@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() { } else { std::vector lod_tensors = GetInputValues(in_var_handles, var_scopes); + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { this->RunAndRecordEvent([&] { - ReduceLoDTensor func(lod_tensors, - out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + // FIXME(zcd): The order of summing is important, + // especially when the type of data is float or double. + // For example, the result of `a+b+c+d` may be different + // with the result of `c+a+b+d`, so the summing order should be fixed. + if (!FLAGS_cpu_deterministic) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + } else { + // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 + // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. + auto &reduce_sum_trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handle->name_) + ->GetMutable(); + ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + + auto trg = out_var->GetMutable(); + if (reduce_sum_trg.data() != trg->data()) { + TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); + } + } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 924ff4d118a192a43e5828a38fd1abbaac1a8526..f44b374edb29228dff5a8bf003d945291f166d49 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. 
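The FLAGS_cpu_deterministic comment above is easy to verify: floating-point addition is not associative, so summing per-scope partials in whatever order threads finish can change the result from run to run. A self-contained demonstration:

    #include <cstdio>

    int main() {
      float a = 1e8f, b = -1e8f, c = 1.0f;
      std::printf("%g\n", (a + b) + c);  // prints 1
      std::printf("%g\n", a + (b + c));  // prints 0: c is absorbed by b
      return 0;
    }

Fixing the summation target (reduce_sum_trg in scope 0) fixes the association order, which is exactly what the flag buys at the cost of the extra copy afterwards.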
#include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -33,7 +34,7 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->DebugString() == "dummy") { // HACK + if (ir::IsControlDepVar(*in->Node())) { // HACK continue; } if (in->GeneratedOp()) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( eptr = std::current_exception(); } + platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index cbfbcb1c0cd24f16773f9633310166371600790c..5e87e0bf50b51d2b630aba06a5907dd721754d1f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -40,6 +40,11 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } + FeedFetchList Run(const std::vector& fetch_tensors) override; private: diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc deleted file mode 100644 index 7bc130ef6e8d2e0caf6e445d12950b87e6dd4dbd..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include - -namespace paddle { -namespace framework { -namespace details { -void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) { - for (auto &var_map : graph->Get("vars")) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - continue; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - OpHandleBase *write_op = (*it_new)->GeneratedOp(); - const auto &read_ops = (*it_old)->PendingOps(); - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. - continue; - } - - auto *dep_var = new DummyVarHandle( - graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); - } - } - } - } -} - -VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( - Graph *graph, ir::Node *node, const platform::Place &place, - size_t place_offset) { - auto &var_holders = graph->Get("vars")[place_offset]; - auto &var_holder = var_holders[node->Name()]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - if (node->Var()) { - var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset, - node->Name(), place); - } else { - var = new VarHandle( - graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0, - place_offset, node->Name(), place); - } - var_holder.emplace_back(var); - } else { - var = var_holder.rbegin()->get(); - } - return var; -} - -void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle, - ir::Node *new_node, - const platform::Place &place, - size_t place_offset) { - auto &vars = graph->Get("vars")[place_offset][new_node->Name()]; - size_t version = vars.size(); - auto var = - new VarHandle(new_node, version, place_offset, new_node->Name(), place); - vars.emplace_back(var); - op_handle->AddOutput(var); -} - -void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) { - for (auto &op : graph->Get("ops")) { - if (!op->Outputs().empty()) { - continue; - } - auto *dummy_leaf = new DummyVarHandle( - graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); - graph->Get("dep_vars").emplace(dummy_leaf); - op->AddOutput(dummy_leaf); - } -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc deleted file mode 100644 index b4b49d3de6da2e5fd7836668619e42d10bb6b35a..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" -#include -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph_checker.h" -#include "paddle/fluid/framework/details/ssa_graph_printer.h" - -namespace paddle { -namespace framework { -namespace details { -std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res( -#ifdef PADDLE_WITH_CUDA - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, nccl_ctxs_, strategy_) -#else - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, strategy_) -#endif - ); // NOLINT - - if (!strategy_.debug_graphviz_path_.empty()) { - std::unique_ptr fout( - new std::ofstream(strategy_.debug_graphviz_path_)); - PADDLE_ENFORCE(fout->good()); - std::unique_ptr graphviz_printer( - new GraphvizSSAGraphPrinter()); - res.reset(new SSAGraghBuilderWithPrinter( - std::move(fout), std::move(graphviz_printer), std::move(res))); - } - res.reset(new SSAGraghBuilderWithChecker(std::move(res))); - - return res; -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h deleted file mode 100644 index 91a119de83ed3d1573803e48faf86c874eed98d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include "paddle/fluid/platform/place.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class SSAGraphBuilderFactory { - public: - SSAGraphBuilderFactory(const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, - const BuildStrategy& strategy) - : places_(places), - loss_var_name_(loss_var_name), - param_names_(param_names), - local_scopes_(local_scopes), - strategy_(strategy) { -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_ = nullptr; -#endif - } - -#ifdef PADDLE_WITH_CUDA - void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { - nccl_ctxs_ = nccl_ctxs; - } -#endif - - std::unique_ptr Create(); - - private: - std::vector places_; - std::string loss_var_name_; - std::unordered_set param_names_; - std::vector local_scopes_; - BuildStrategy strategy_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap* nccl_ctxs_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 8815ec89b23bc874471eefde5fa855cd2a4bde1f..96fffb7d9430cd00b3823ada9fbe9a65a6bd718c 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -32,7 +32,9 @@ class SSAGraphExecutor { virtual ~SSAGraphExecutor(); - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h deleted file mode 100644 index cd72162f44ca76aa6340606cf79a73601eae89af..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "paddle/fluid/framework/details/ssa_graph_builder.h" - -namespace paddle { -namespace framework { -namespace details { - -class SSAGraphPrinter { - public: - virtual ~SSAGraphPrinter() {} - virtual void Print(const Graph& graph, std::ostream& sout) const = 0; -}; - -class GraphvizSSAGraphPrinter : public SSAGraphPrinter { - public: - void Print(const Graph& graph, std::ostream& sout) const override; -}; - -class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { - public: - SSAGraghBuilderWithPrinter(std::ostream& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ref_(sout) {} - - SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ptr_(std::move(sout)), - stream_ref_(*stream_ptr_) {} - - std::unique_ptr Apply(std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - printer_->Print(*new_graph, stream_ref_); - return new_graph; - } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); - } - - private: - std::unique_ptr printer_; - std::unique_ptr builder_; - std::unique_ptr stream_ptr_; - std::ostream& stream_ref_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index f85c62dd6c4a8033a037b1e001ece6a9cc90ca98..994bb6492f685138d02971a6caf12572aecd6d6f 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -14,14 +14,16 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, std::unique_ptr &&graph) + const std::vector &places, + std::unique_ptr &&graph) : graph_(std::move(graph)), pool_(strategy.num_threads_ >= 2 ? 
new ::ThreadPool(strategy.num_threads_) : nullptr), @@ -33,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { + std::unique_ptr event( + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); std::unordered_map pending_ops; std::unordered_set pending_vars; BlockingQueue ready_vars; @@ -44,18 +48,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->Get("dep_vars")) { + for (auto &var : graph_->Get(details::kGraphDepVars)) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->Get("ops")) { + for (auto &op : graph_->Get(details::kGraphOps)) { if (op->Inputs().empty()) { // Special case, Op has no input. ready_ops.insert(op.get()); } else { @@ -82,7 +86,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); - exception_.reset(); + exception_holder_.Clear(); + event.reset(nullptr); // Step 3. Execution while (!pending_vars.empty()) { @@ -102,23 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::unique_lock l(exception_mu_); - if (exception_) { - l.unlock(); + if (exception_holder_.ExceptionCatched()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - l.lock(); - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; - } + exception_holder_.Throw(); } else { continue; } @@ -161,7 +154,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); @@ -228,14 +221,9 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } + exception_holder_.Catch(ex); } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); + exception_holder_.Catch(ex); } catch (...) 
{ LOG(FATAL) << "Unknown exception catched"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index bf7c0a367a19ff4ac9462334516f1577672faa68..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -40,8 +41,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + std::unique_ptr &&graph); + const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -53,13 +55,12 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 6f00abd9473a84a77ed1a39015e2ae079e00be79..5457870e9ff5d7cf67c9c7076b9aae94eeada779 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const { return ss.str(); } -std::string DummyVarHandle::DebugString() const { return "dummy"; } +std::string DummyVarHandle::DebugString() const { return node_->Name(); } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c2800c972a5501859672fbfd6921499e84d09cb0..dad170ed78c64202b5c812bd8682887fe3b736d6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); - // NOTE! Please do not delete this line, it's usefull because the debug - // string before and after op.run are different, after run the output - // will have right shape which is usefull for debug. 
- VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 if (FLAGS_benchmark) {
 VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ee0604383ec9df826fa2abaef1f643ba0da6a096..bf7d76a8a6e173e648cea5aaba9b7202d787173b 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,5 +1,9 @@
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
-cc_library(pass SRCS pass.cc DEPS graph node)
+cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
+cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
+cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
+cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
+cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
+cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index e4021aa92b6da2343b604fb7bc01d31edb97d842..f87d5212c0cd87a5a63cf2d54ca677516ab45816 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -12,14 +12,80 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include
+#include
+
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 namespace paddle {
 namespace framework {
+namespace ir {
+
+std::vector<std::string> FindDistTrainSendVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> send_vars;
+  // Since parameters are all in block 0, it's enough to scan only the
+  // send ops in block 0.
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->InputArgumentNames();
+    send_vars.reserve(send_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return send_vars;
+}
+
+std::vector<std::string> FindDistTrainRecvVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> recv_vars;
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->OutputArgumentNames();
+    recv_vars.reserve(recv_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return recv_vars;
+}
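// A clarifying example (editorial; the exact names are illustrative, assuming
// the transpiler's usual "%s.block%d" naming): a parameter such as "fc_0.w_0"
// is split into pieces named "fc_0.w_0.block0", "fc_0.w_0.block1", ...;
// IsDistTrainOp() below keys off that ".block" suffix.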
+
+bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
+                   const std::vector<std::string> &recv_vars) {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
+    return false;
+  }
+
+  /**
+   * Check whether any of the op's variable names contains the `.block`
+   * suffix and also appears in the given RPC variable list.
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &rpc_vars) -> bool {
+    for (auto &var : opvars) {
+      // A variable name with the suffix `.block` means it's a variable
+      // split by the DistributeTranspiler
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py].
+      if (var.find(".block") != std::string::npos &&
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
+}
-// NOTE(paddle-dev): This graph contains circle.
 Graph::Graph(const ProgramDesc &program) : program_(program) {
 VLOG(3) << "block in program:" << program_.Size();
 std::unordered_map<std::string, VarDesc *> all_vars;
@@ -27,40 +93,160 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
 all_vars.emplace(var->Name(), var);
 }
- std::map<std::string, ir::Node *> var_nodes;
+ std::map<std::string, std::vector<ir::Node *>> var_nodes;
 for (auto *op : program.Block(0).AllOps()) {
 ir::Node *node = CreateOpNode(op);
-
+ // For input args, reuse the same var name if it was created before.
+ // Otherwise, create a new one.
 for (auto &each_var_name : op->InputArgumentNames()) {
 ir::Node *var = nullptr;
 if (var_nodes.find(each_var_name) != var_nodes.end()) {
- var = var_nodes.at(each_var_name);
+ var = var_nodes.at(each_var_name).back();
 } else if (all_vars.count(each_var_name) != 0) {
 var = CreateVarNode(all_vars.at(each_var_name));
- var_nodes[each_var_name] = var;
+ var_nodes[each_var_name].push_back(var);
 } else {
- // TODO(paddle-dev): Seems some assumption doesn't hold?
- VLOG(3) << op->Type()
- << " input var not in all_var list: " << each_var_name;
+ // An operator's input var can be optional (dispensable), meaning the
+ // operator doesn't actually need the var at runtime. In that case,
+ // the missing var is treated as ready from the beginning.
 var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
- var_nodes[each_var_name] = var;
+ var_nodes[each_var_name].push_back(var);
 }
 node->inputs.push_back(var);
 var->outputs.push_back(node);
 }
-
+ // For output args, always create a new var.
 for (auto &each_var_name : op->OutputArgumentNames()) {
- ir::Node *var = nullptr;
- if (var_nodes.find(each_var_name) != var_nodes.end()) {
- var = var_nodes.at(each_var_name);
- } else {
- var = CreateVarNode(all_vars.at(each_var_name));
- var_nodes[each_var_name] = var;
- }
+ ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+ var_nodes[each_var_name].push_back(var);
 node->outputs.push_back(var);
 var->inputs.push_back(node);
 }
 }
+
+ std::vector<ir::Node *> send_ops;
+ ir::Node *send_bar = nullptr;
+ std::vector<ir::Node *> recv_ops;
+ ir::Node *fetch_bar = nullptr;
+ for (ir::Node *node : Nodes()) {
+ if (node->Name() == "send") {
+ send_ops.push_back(node);
+ } else if (node->Name() == "send_barrier") {
+ PADDLE_ENFORCE(!send_bar, "there can be only one send_barrier");
+ send_bar = node;
+ } else if (node->Name() == "recv") {
+ recv_ops.push_back(node);
+ } else if (node->Name() == "fetch_barrier") {
+ PADDLE_ENFORCE(!fetch_bar, "there can be only one fetch_barrier");
+ fetch_bar = node;
+ }
+ }
+ if (send_bar) {
+ for (ir::Node *send : send_ops) {
+ ir::Node *dep_var = CreateControlDepVar();
+ send->outputs.push_back(dep_var);
+ dep_var->inputs.push_back(send);
+ send_bar->inputs.push_back(dep_var);
+ dep_var->outputs.push_back(send_bar);
+ }
+ for (ir::Node *recv : recv_ops) {
+ ir::Node *dep_var = CreateControlDepVar();
+ recv->inputs.push_back(dep_var);
+ dep_var->outputs.push_back(recv);
+ send_bar->outputs.push_back(dep_var);
+ dep_var->inputs.push_back(send_bar);
+ }
+ }
+ if (fetch_bar) {
+ for (ir::Node *recv : recv_ops) {
+ ir::Node *dep_var = CreateControlDepVar();
+ recv->outputs.push_back(dep_var);
+ dep_var->inputs.push_back(recv);
+ fetch_bar->inputs.push_back(dep_var);
+ dep_var->outputs.push_back(fetch_bar);
+ }
+ }
+
+ std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
+ std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
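// A schematic summary of the control-dependency wiring above and below
// (editorial; each "ctrl" edge is a var made by CreateControlDepVar()):
//
//   send --ctrl--> send_barrier --ctrl--> recv
//   recv --ctrl--> fetch_barrier --ctrl--> concat   (a dist-train op)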
+ for (ir::Node *node : Nodes()) {
+ if (IsDistTrainOp(node, send_vars, recv_vars)) {
+ if (fetch_bar && node->Name() == "concat") {
+ ir::Node *dep_var = CreateControlDepVar();
+ fetch_bar->outputs.push_back(dep_var);
+ dep_var->inputs.push_back(fetch_bar);
+ node->inputs.push_back(dep_var);
+ dep_var->outputs.push_back(node);
+ }
+ }
+ }
+
+ /**
+  * We should handle write-after-read (WAR) and write-after-write (WAW)
+  * hazards here, because some operators of the program can be executed
+  * in parallel. To make the program run in the right order, we add
+  * explicit WAR and WAW dependencies.
+  *
+  * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+  */
+
+ for (auto &var : var_nodes) {
+ auto &versions = var.second;
+ if (versions.size() <= 1) continue;
+
+ auto it_new = versions.rbegin();
+ auto it_old = versions.rbegin();
+ ++it_old;
+ for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+ ir::Node *write_op =
+ (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
+ const auto &read_ops = (*it_old)->outputs;
+
+ PADDLE_ENFORCE(write_op, "The write_op must not be null.");
+
+ // Add a write-after-write dependency.
+ ir::Node *upstream_op =
+ (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
+ if (upstream_op) {
+ ir::Node *dep_var = CreateControlDepVar();
+ write_op->inputs.push_back(dep_var);
+ upstream_op->outputs.push_back(dep_var);
+ dep_var->outputs.push_back(write_op);
+ dep_var->inputs.push_back(upstream_op);
+ }
+
+ for (auto *read_op : read_ops) {
+ // Manually add a dependency var from read_op to write_op.
+ if (read_op == write_op) {
+ // The reader and the writer are the same op.
+ continue;
+ }
+ // The two ops might already be connected via other vars.
+ bool has_dep = false;
+ for (ir::Node *r_out : read_op->outputs) {
+ for (ir::Node *w_in : write_op->inputs) {
+ if (r_out == w_in) {
+ has_dep = true;
+ break;
+ }
+ }
+ }
+ if (has_dep) continue;
+
+ ir::Node *dep_var = CreateControlDepVar();
+ read_op->outputs.push_back(dep_var);
+ dep_var->inputs.push_back(read_op);
+ write_op->inputs.push_back(dep_var);
+ dep_var->outputs.push_back(write_op);
+ }
+ }
+ }
+}
+
+bool IsControlDepVar(const ir::Node &var) {
+ return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
+} // namespace ir
} // namespace framework
} // namespace paddle
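// A brief illustration (editorial): CreateControlDepVar(), declared in
// graph.h below, builds node names from kControlDepVarName ("__control_var"
// in node.cc) plus a counter, e.g. "__control_var@37", so a freshly created
// dependency node satisfies IsControlDepVar():
//
//   ir::Node *dep = graph.CreateControlDepVar();
//   PADDLE_ENFORCE(IsControlDepVar(*dep));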
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b029005b723abca2cb9b9a9aa175eda40..c9d55fbf525a1a476ac469e8e57462169a7db2da 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,27 +26,35 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+namespace ir {
 class Graph {
 public:
- explicit Graph(const ProgramDesc& program);
+ explicit Graph(const ProgramDesc &program);
 virtual ~Graph() {
- for (auto& attr : attrs_) {
+ for (auto &attr : attrs_) {
 attr_dels_[attr.first]();
 }
 attrs_.clear();
 attr_dels_.clear();
 }
+ bool Has(const std::string &attr_name) const {
+ return attrs_.find(attr_name) != attrs_.end();
+ }
+
 template <typename AttrType>
- AttrType& Get(const std::string& attr_name) const {
- return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+ AttrType &Get(const std::string &attr_name) const {
+ PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
+ attr_name);
+ return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
 }
 template <typename AttrType>
- void Set(const std::string& attr_name, AttrType* attr) {
- PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+ void Set(const std::string &attr_name, AttrType *attr) {
+ PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+ attr_name);
 attrs_[attr_name] = attr;
 attr_dels_[attr_name] = [attr, attr_name]() {
 VLOG(3) << "deleting " << attr_name;
@@ -54,29 +62,70 @@ class Graph {
 };
 }
- ir::Node* CreateVarNode(VarDesc* var_desc) {
- nodes.emplace_back(new ir::Node(var_desc));
- return nodes.back().get();
+ const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
+
+ // Create a normal variable with a non-null VarDesc.
+ ir::Node *CreateVarNode(VarDesc *var_desc) {
+ return AddNode(new ir::Node(var_desc));
 }
- ir::Node* CreateOpNode(OpDesc* op_desc) {
- nodes.emplace_back(new ir::Node(op_desc));
- return nodes.back().get();
+ // Create a normal runnable operator with OpDesc.
+ ir::Node *CreateOpNode(OpDesc *op_desc) {
+ return AddNode(new ir::Node(op_desc));
 }
- ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
- nodes.emplace_back(new ir::Node(name, type));
- return nodes.back().get();
+ // Create a control-dependency var that connects two operations. The var
+ // doesn't hold any data; other than that, it's no different from any other
+ // var as far as dependency analysis is concerned.
+ ir::Node *CreateControlDepVar() {
+ // TODO(panyx0718): control var name should be really unique.
+ const std::string name = string::Sprintf(
+ "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+ return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
 }
- std::vector<std::unique_ptr<ir::Node>> nodes;
+ // A freer way of creating a graph node, mostly used in tests or to "copy"
+ // from another node. Avoid using it if possible.
+ ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
+ return AddNode(new ir::Node(name, type));
+ }
+
+ // Clear all node information of the graph and return the ownership of the
+ // nodes.
+ std::vector<std::unique_ptr<ir::Node>> ReleaseNodes() {
+ std::vector<std::unique_ptr<ir::Node>> ret;
+ for (auto &n : nodes_) {
+ ret.emplace_back(n.second.release());
+ }
+ nodes_.clear();
+ node_set_.clear();
+ return ret;
+ }
 private:
+ // This method takes ownership of `node`.
+ ir::Node *AddNode(ir::Node *node) {
+ PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
+ nodes_[node].reset(node);
+ node_set_.insert(node);
+ return node;
+ }
+
+ void RemoveNode(ir::Node *node) {
+ PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+ node_set_.erase(node);
+ nodes_.erase(node);
+ }
+
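// A minimal usage sketch of the attribute API above (editorial; the
// attribute name "my_attr" is hypothetical):
//
//   ir::Graph g(program);
//   g.Set<int>("my_attr", new int(0));  // the graph takes ownership
//   if (g.Has("my_attr")) {
//     g.Get<int>("my_attr") = 42;       // typed reference to the same int
//   }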
+ // NOTE: program_ shouldn't be exposed to user.
- const ProgramDesc& program_;
+ const ProgramDesc &program_;
 std::map<std::string, boost::any> attrs_;
 std::map<std::string, std::function<void(void)>> attr_dels_;
+ std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
+ std::unordered_set<ir::Node *> node_set_;
 };
+bool IsControlDepVar(const ir::Node &var);
+} // namespace ir
 } // namespace framework
 } // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1c19e6535150130822e9f48685241e62de5b064
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+void SortHelper(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    ir::Node *node, std::unordered_set<ir::Node *> *visited,
+    std::vector<ir::Node *> *ret) {
+  visited->insert(node);
+
+  for (auto adj : adj_list.at(node)) {
+    if (visited->find(adj) == visited->end()) {
+      SortHelper(adj_list, adj, visited, ret);
+    }
+  }
+
+  VLOG(3) << "topology sort insert: " << node->Name()
+          << reinterpret_cast<void *>(node) << " input "
+          << node->inputs.size();
+  ret->push_back(node);
+}
+
+bool HasCircleHelper(
+    ir::Node *node,
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    std::unordered_set<ir::Node *> *visited,
+    std::unordered_set<ir::Node *> *in_trace) {
+  if (visited->find(node) == visited->end()) {
+    visited->insert(node);
+    in_trace->insert(node);
+
+    for (ir::Node *in : adj_list.at(node)) {
+      if (visited->find(in) == visited->end() &&
+          HasCircleHelper(in, adj_list, visited, in_trace)) {
+        return true;
+      } else if (in_trace->find(in) != in_trace->end()) {
+        return true;
+      }
+    }
+  }
+  in_trace->erase(node);
+  return false;
+}
+
+bool HasCircleInternal(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::unordered_set<ir::Node *> in_trace;
+  for (auto &adj : adj_list) {
+    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+bool HasCircle(const Graph &graph) {
+  return HasCircleInternal(BuildOperationAdjList(graph));
+}
+
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
+      BuildOperationAdjList(graph);
+  PADDLE_ENFORCE(!HasCircleInternal(adj_list));
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+  return ret;
+}
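// A minimal usage sketch of these helpers (editorial):
//
//   ir::Graph g(program_desc);
//   if (!HasCircle(g)) {
//     for (ir::Node *op : TopologySortOperations(g)) {
//       VLOG(3) << "visit op " << op->Name();
//     }
//   }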
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : graph.Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        adj_list[n].insert(adj_n);
+        VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                << " via " << var->Name() << reinterpret_cast<void *>(var);
+      }
+    }
+  }
+  return adj_list;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd6c53a07f8f56781989739d995226bd02b3d3d0
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+// Test whether the graph contains a cycle.
+bool HasCircle(const Graph &graph);
+
+// Topologically sort the operations in the graph from inputs to outputs;
+// `graph` must not contain a cycle.
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
+
+// Build an adjacency list of operations for the `graph`.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph);
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a260dd3da2a7863c06e51aa4feafd824ea254139
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/ir/graph.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o1->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o1); +} + +void BuildCircleGraph2(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +void BuildNoCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Basic) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + ASSERT_TRUE(HasCircle(g)); + + Graph g2(prog); + BuildCircleGraph2(&g2); + ASSERT_TRUE(HasCircle(g2)); + + auto adj_list = BuildOperationAdjList(g2); + for (auto& adj : adj_list) { + auto& adj_set = adj.second; + if (adj.first->Name() == "op1") { + ASSERT_EQ((*adj_set.begin())->Name(), "op2"); + } else if (adj.first->Name() == "op2") { + ASSERT_EQ((*adj_set.begin())->Name(), "op1"); + } else { + ASSERT_TRUE(false); + } + } + + Graph g3(prog); + BuildNoCircleGraph(&g3); + ASSERT_FALSE(HasCircle(g3)); + auto sorted = TopologySortOperations(g3); + std::map node_map; + for (size_t i = 0; i < sorted.size(); ++i) { + node_map[sorted[i]->Name()] = i; + } + ASSERT_EQ(node_map.at("op1"), 0UL); + ASSERT_EQ(node_map.at("op2"), 1UL); + ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 4e23bf124f8822e25be0f6b1c7c8c5de4e4f600a..f9e6bdf3625bdced9d1a9195a979b0f46016d8bf 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ 
b/paddle/fluid/framework/ir/graph_test.cc @@ -76,6 +76,7 @@ TEST(GraphTest, Basic) { op->SetType("sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); + op->SetAttr("op_role", 1); prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS); prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS); @@ -92,21 +93,22 @@ TEST(GraphTest, Basic) { ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test_out")->GetType()); - std::unique_ptr g(new Graph(prog)); - ASSERT_EQ(g->nodes[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a"); - ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b"); - ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c"); - ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out"); - ASSERT_EQ(g->nodes[1]->Name(), "test_a"); - ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[2]->Name(), "test_b"); - ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[3]->Name(), "test_c"); - ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[4]->Name(), "test_out"); - ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes.size(), 5); + std::unique_ptr g(new ir::Graph(prog)); + std::vector nodes(g->Nodes().begin(), g->Nodes().end()); + for (ir::Node *n : nodes) { + if (n->Name() == "sum") { + ASSERT_EQ(n->inputs.size(), 3UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_a" || n->Name() == "test_b" || + n->Name() == "test_c") { + ASSERT_EQ(n->inputs.size(), 0UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_out") { + ASSERT_EQ(n->inputs.size(), 1UL); + ASSERT_EQ(n->outputs.size(), 0UL); + } + } + ASSERT_EQ(nodes.size(), 5); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8cb812d1388bf74d173a4dc7a99561e730f8e95a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { +static const char kGraphVizPath[] = "graph_viz_path"; + +std::unique_ptr GraphVizPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string graph_viz_path = Get(kGraphVizPath); + std::unique_ptr fout(new std::ofstream(graph_viz_path)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kVariable) continue; + size_t cur_var_id = var_id++; + vars[n] = cur_var_id; + + sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" + << std::endl; + } + + size_t op_id = 0; + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kOperation) continue; + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" + << std::endl; + for (auto in : n->inputs) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : n->outputs) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1fd8c8a26e9581ccf605d4271a49ec2e90d8b997 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphVizPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 86376e7e8bc8bee2ddbc18f7f24bcdd849a06cbf..aca77da8d674f29b89c023717cdcd061232d023a 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -15,5 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/node.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +namespace ir { +const char Node::kControlDepVarName[] = "__control_var"; +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index b98c29b81ddc2f57553b8fe76fcfeb0936ddd837..b3138fccee86fb274abe72007961fc1c982b1e96 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -27,6 +27,8 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; + static const char kControlDepVarName[]; + explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -50,6 +52,7 @@ class Node { PADDLE_ENFORCE(type_ == Type::kVariable); return var_desc_; } + OpDesc* Op() { PADDLE_ENFORCE(type_ == Type::kOperation); return op_desc_; diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index c05d7d0bb54c8ba5938e08f7e8dace8f607d7b89..d7158eba62686be57499df697466797e4034ea8f 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,7 +13,34 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +namespace ir { +std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); + PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not set.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", + attr); + } + auto applied_graph = ApplyImpl(std::move(graph)); + // TODO(panyx0718): Add more verifications. + PADDLE_ENFORCE(!HasCircle(*applied_graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + applied_ = true; + return applied_graph; +} + +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index f52ba788d55ddb9ed27baa3f6ff0a97e52370fe0..0f14083d259172f5b5f1ed80c7d38312d711beb5 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -14,21 +14,187 @@ limitations under the License. */ #pragma once +#include +#include +#include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { namespace ir { +template +struct PassRegistrar; class Pass { public: Pass() = default; - virtual ~Pass() {} + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } + + std::unique_ptr Apply(std::unique_ptr graph) const; + + // Get a reference to the attributed previously set. 
+
+ // Get a reference to the attribute previously set.
+ template <typename AttrType>
+ AttrType &Get(const std::string &attr_name) const {
+ PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
+ "%s attr not registered for pass.", attr_name);
+ return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+ }
+
+ // Set a pointer to the attribute. Pass takes ownership of the attribute.
+ template <typename AttrType>
+ void Set(const std::string &attr_name, AttrType *attr) {
+ PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+ attr_name);
+ attrs_[attr_name] = attr;
+ attr_dels_[attr_name] = [attr, attr_name]() {
+ VLOG(3) << "deleting " << attr_name;
+ delete attr;
+ };
+ }
+
+ // Set a pointer to the attribute. Pass doesn't take ownership. The caller
+ // should delete the attribute.
+ template <typename AttrType>
+ void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+ PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+ attrs_[attr_name] = attr;
+ }
+
+ protected:
+ virtual std::unique_ptr<Graph> ApplyImpl(
+ std::unique_ptr<Graph> graph) const = 0;
+
+ private:
+ template <typename PassType>
+ friend struct PassRegistrar;
+
+ void RegisterRequiredPassAttrs(
+ const std::unordered_set<std::string> &attrs) {
+ required_pass_attrs_.insert(attrs.begin(), attrs.end());
+ }
+
+ void RegisterRequiredGraphAttrs(
+ const std::unordered_set<std::string> &attrs) {
+ required_graph_attrs_.insert(attrs.begin(), attrs.end());
+ }
+
+ mutable bool applied_{false};
+ std::unordered_set<std::string> required_pass_attrs_;
+ std::unordered_set<std::string> required_graph_attrs_;
+ std::map<std::string, boost::any> attrs_;
+ std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+using PassCreator = std::function<std::unique_ptr<Pass>()>;
+
+class Registrar {
+ public:
+ // In our design, the various kinds of passes have their corresponding
+ // registry and registrar. The act of registration happens in the
+ // constructor of a global registrar variable, which is not referenced by
+ // the code that links the framework package and would therefore be
+ // stripped from the generated binary by the linker. To avoid such removal,
+ // we add a Touch method to all registrar classes and make the USE_PASS
+ // macro call it. So, as long as the calling code uses USE_PASS, the global
+ // registrar variable won't be removed by the linker.
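// For example (editorial sketch): a binary that links a pass compiled in
// another translation unit keeps the registration alive by writing, at
// global scope,
//
//   USE_PASS(graph_viz_pass);
//
// which expands to an extern declaration of
// TouchPassRegistrar_graph_viz_pass() plus a static int initialized from it,
// forcing the linker to retain the registrar object (see the macros below).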
+ void Touch() {} +}; - virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; +class PassRegistry { + public: + static PassRegistry &Instance(); + + bool Has(const std::string &pass_type) const { + return map_.find(pass_type) != map_.end(); + } + + void Insert(const std::string &pass_type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type); + map_.insert({pass_type, pass_creator}); + } + + std::unique_ptr Get(const std::string &pass_type) const { + PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", + pass_type); + return map_.at(pass_type)(); + } + + private: + PassRegistry() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(PassRegistry); }; + +template +struct PassRegistrar : public Registrar { + explicit PassRegistrar(const char *pass_type) { + PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), + "'%s' is registered more than once.", pass_type); + PassRegistry::Instance().Insert( + pass_type, [this]() -> std::unique_ptr { + std::unique_ptr pass(new PassType()); + pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); + pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + return pass; + }); + } + + PassRegistrar &RequirePassAttr(const std::string &attr) { + required_pass_attrs_.insert(attr); + return *this; + } + + PassRegistrar &RequireGraphAttr(const std::string &attr) { + required_graph_attrs_.insert(attr); + return *this; + } + + private: + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; +}; + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + TouchPassRegistrar_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b5011412ed39e033a7a65921e9c64ce2d54c638 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set<int>("copy_test_pass_attr", new int);
+    graph->Set<int>("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't have a cycle") != exception.npos);
+}
+
+} // namespace ir
+} // namespace framework
+} // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index
71bebeea637a7eb6e3bfddc0b2b641477b06bcdf..7836ecb1272a07a79a70c9cb040335f9a42e5684 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/tensor.h" @@ -386,13 +387,14 @@ template class CPUVector : public std::vector> { public: CPUVector() : std::vector() {} - CPUVector(size_t count, const T &value = T()) + CPUVector(size_t count, const T &value = T()) // NOLINT : std::vector(count, value) {} CPUVector(std::initializer_list init) : std::vector(init) {} - CPUVector(const std::vector &other) : std::vector(other) {} - explicit CPUVector(const CPUVector &other) : std::vector(other) {} + CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT + CPUVector(const CPUVector &other) : std::vector(other) {} CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - CPUVector(std::vector &&other) : std::vector(std::move(other)) {} + CPUVector(std::vector &&other) // NOLINT + : std::vector(std::move(other)) {} CPUVector &operator=(const CPUVector &other) { this->assign(other.begin(), other.end()); return *this; @@ -410,8 +412,6 @@ class CPUVector : public std::vector> { return os; } - void resize(size_t size) { this->resize(size); } - T &operator[](size_t id) { return this->at(id); } const T &operator[](size_t id) const { return this->at(id); } diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0599c8d384641606b0a5ebb5ba1781b56f539e63 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10UL); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} + +TEST(mixed_vector, ForEach) { + vec tmp; + for (auto& v : tmp) { + VLOG(3) << v; + } +} + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +TEST(mixed_vector, Resize) { + paddle::framework::Vector vec; + vec.resize(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index d57f82510833d6a0cea7009cf1f0b49543812f8d..4b0caa8d350dde0462e5fdcca743df919358a364 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -11,7 +11,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include +#include #include "glog/logging.h" #include "gtest/gtest.h" @@ -21,26 +23,6 @@ template using vec = paddle::framework::Vector; -TEST(mixed_vector, CPU_VECTOR) { - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - vec tmp2; - tmp2 = tmp; - ASSERT_EQ(tmp2.size(), 10UL); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp2[i], i); - ASSERT_EQ(tmp2[i], tmp[i]); - } - int cnt = 0; - for (auto& t : tmp2) { - ASSERT_EQ(t, cnt); - ++cnt; - } -} - static __global__ void multiply_10(int* ptr) { for (int i = 0; i < 10; ++i) { ptr[i] *= 10; @@ -91,24 +73,3 @@ TEST(mixed_vector, MultiGPU) { ASSERT_EQ(tmp[i], i * 100); } } - -TEST(mixed_vector, InitWithCount) { - paddle::framework::Vector vec(10, 10); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(vec[i], 10); - } -} - -TEST(mixed_vector, ForEach) { - vec tmp; - for (auto& v : tmp) { - } -} - -TEST(mixed_vector, Reserve) { - paddle::framework::Vector vec; - vec.reserve(1); - vec.push_back(0); - vec.push_back(0); - vec.push_back(0); -} diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) { ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" "CUDNN]"); + + using CUDAPlace = paddle::platform::CUDAPlace; + OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), + "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "type[CUDNN]"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 
001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..2288c7fe6609a765612b468d69ad35101b92b384 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
 return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
+void OpProtoAndCheckerMaker::Reuse(const std::string& name,
+                                   const std::string& reused_name) {
+  bool found = false;
+  proto::OpProto::Var* var;
+
+  for (auto& var : proto_->inputs()) {
+    if (var.name() == reused_name) {
+      found = true;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them does "
+                 "not exist or they do not match.",
+                 name, reused_name);
+
+  found = false;
+  for (int i = 0; i < proto_->outputs().size(); ++i) {
+    var = proto_->mutable_outputs()->Mutable(i);
+    if (var->name() == name) {
+      PADDLE_ENFORCE(!var->has_reuse(),
+                     "Output(%s) has already been set to reuse var %s", name,
+                     var->reuse());
+      found = true;
+      var->set_reuse(reused_name);
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them does "
+                 "not exist or they do not match.",
+                 name, reused_name);
+}
+
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
 std::unordered_set<std::string> names;
 auto checker = [&](const std::string& name) {
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 92f86bb5de520878d0a7b8d7214620580242c061..80970291c9c234f1306162f4ffa3c2528f88c35f 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
 VariableBuilder AddOutput(const std::string &name,
 const std::string &comment);
+ void Reuse(const std::string &name, const std::string &reused_name);
+
 template <typename T>
 TypedAttrChecker &AddAttr(const std::string &name,
 const std::string &comment,
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
 }
 class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+ void Make() {
+ AddInput("X", "input of test op");
+ AddOutput("XOut", "output of test op").Reuse("X");
+ }
+};
+
+class TestInplaceProtoMaker2
+    : public paddle::framework::OpProtoAndCheckerMaker {
 public:
 void Make() {
 AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 TEST(ProtoMaker, InplaceOutput) {
- paddle::framework::proto::OpProto op_proto;
+ paddle::framework::proto::OpProto op_proto, op_proto2;
 paddle::framework::OpAttrChecker op_checker;
 TestInplaceProtoMaker proto_maker;
- ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+ TestInplaceProtoMaker2 proto_maker2;
+
+ proto_maker(&op_proto, &op_checker);
+
+ ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
 paddle::platform::EnforceNotMet);
- // proto_maker(&op_proto, &op_checker);
- // proto_maker.Make();
- // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
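// An editorial sketch of how an operator maker would use the in-place API
// exercised by these tests (the maker below is hypothetical):
//
//   class MyOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
//    public:
//     void Make() {
//       AddInput("X", "input tensor");
//       AddOutput("Out", "output tensor").Reuse("X");  // Out reuses X's buffer
//     }
//   };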
+ AddOutput("XOut", "output of test op"); + // avoid destructor exception. + // Validate(); + TestReuse(); + } + + virtual void TestReuse() {} +}; + +// test duplicate reuse error +class TestReuseProtoMaker2 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "X"); + Reuse("Out", "Y"); + } +}; + +// NotExists Input +class TestReuseProtoMaker3 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "NotExists"); + Reuse("XOut", "X"); + } +}; + +// NotExists Output +class TestReuseProtoMaker4 : public TestReuseProtoMaker { + public: + void TestReuse() { Reuse("NotExists", "X"); } +}; + +TEST(ProtoMaker, Reuse) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker proto_maker; + proto_maker(&op_proto, &op_checker); +} + +// NOTE(dzhwinter): +// There is a Fatal CHECK on base class destructor, which will call abort inside +// instead of +// throw an exception. If we throw an exception in Make(), we will trigger the +// CHECK and terminate the tests. +// +// I had tried to replace the default CHECK with a exception, however, it's +// still not supported by glog. +// the details: +// https://github.com/google/glog/issues/249 +// https://github.com/facebookresearch/TensorComprehensions/issues/351 +/* +TEST(ProtoMaker, ReuseWithException) { + paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker2 proto_maker2; + TestReuseProtoMaker3 proto_maker3; + TestReuseProtoMaker4 proto_maker4; + EXPECT_THROW(proto_maker2(&op_proto2, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker3(&op_proto3, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker4(&op_proto4, &op_checker), + paddle::platform::EnforceNotMet); +} + +void FailureFunction() { + throw std::runtime_error("Check failed in destructor."); + // return 0; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::InstallFailureFunction(&FailureFunction); + return RUN_ALL_TESTS(); +} +*/ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d1dc5fcd97b77fb7707c7d48f6eaeef140d3f306..d04f7744961b2561977f4d36d0073a97557043da 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" @@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, } if (var->IsType()) { - return var->Get().dims(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { return var->Get().value().dims(); @@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static std::string GetDtype(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return ""; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(ToDataType(tensor.type())); + } else if (var->IsType()) { + return DataTypeToString( + ToDataType(var->Get().value().type())); + } else { + return ""; + } +} + static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -91,14 +116,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } if (var->IsType()) { - return var->Get().lod(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); } else { return default_lod; } } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(10) << "- " << DebugStringEx(&scope); + VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -107,8 +136,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - VLOG(10) << "+ " << DebugStringEx(&scope); + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -172,6 +203,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { if (row_size >= 0) { ss << "[row_size=" << row_size << "]"; } + std::string dtype = GetDtype(*scope, input.second[i]); + ss << ":" << dtype; ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } @@ -608,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - // For profiling, don't move out of this function because that will result - // in the failure of multi-GPU profiling. - platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); @@ -679,6 +709,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } } @@ -746,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); int data_type = -1; + std::string last_input_name; for (auto& input : this->inputs_) { for (auto& ipt_name : input.second) { auto* var = scope.FindVar(ipt_name); @@ -762,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), - data_type, tmp); + "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", + Type(), last_input_name, data_type, ipt_name, tmp); data_type = tmp; + last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1e5bba62b53025dacdbf2d74b35f266cf4e422c2..275cb8c592c3c0b153d31149570cd6596b9e1a7f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -19,19 +19,80 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif +#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { +std::unique_ptr ApplyParallelExecutorPass( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, const bool use_cuda, +#ifdef PADDLE_WITH_CUDA + const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { +#else + const BuildStrategy &strategy) { +#endif + // Convert the program to graph. + std::unique_ptr graph(new ir::Graph(main_program)); + + // Apply a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + + // Convert graph to run on multi-devices. + auto multi_devices_pass = + ir::PassRegistry::Instance().Get("multi_devices_pass"); + multi_devices_pass->SetNotOwned>("places", + &places); + multi_devices_pass->SetNotOwned("loss_var_name", + &loss_var_name); + multi_devices_pass->SetNotOwned>( + "params", ¶m_names); + multi_devices_pass->SetNotOwned>("local_scopes", + &local_scopes); + multi_devices_pass->SetNotOwned("strategy", &strategy); + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; + multi_devices_pass->SetNotOwned("nccl_ctxs", nctx); +#endif + graph = multi_devices_pass->Apply(std::move(graph)); + + // Apply a graph print pass to record a graph with device info. + if (!strategy.debug_graphviz_path_.empty()) { + auto multi_devices_print_pass = + ir::PassRegistry::Instance().Get("multi_devices_print_pass"); + multi_devices_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy.debug_graphviz_path_); + multi_devices_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + graph = multi_devices_print_pass->Apply(std::move(graph)); + } + + // Verify that the graph is correct for multi-device executor. + auto multi_devices_check_pass = + ir::PassRegistry::Instance().Get("multi_devices_check_pass"); + graph = multi_devices_check_pass->Apply(std::move(graph)); + return graph; +} + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -119,21 +180,19 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = var->Persistable(); } - // Step 3. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - details::SSAGraphBuilderFactory builder_factory( - member_->places_, loss_var_name, params, member_->local_scopes_, - build_strategy); - if (member_->use_cuda_) { +// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy, + member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA."); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - } - builder_ = builder_factory.Create(); - std::unique_ptr graph(new Graph(main_program)); - graph = builder_->Apply(std::move(graph)); + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -146,11 +205,18 @@ void ParallelExecutor::BCastParamsToDevices( // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. - bool initializing = builder_.get() == nullptr ? true : false; - + bool initializing = member_->executor_ ? false : true; for (auto &var : vars) { - int var_dev_id = - builder_.get() == nullptr ? 
-1 : builder_->GetVarDeviceID(var); + int var_dev_id = -1; + if (member_->executor_) { + auto &sharded_var_device = + member_->executor_->Graph().Get( + details::kShardedVarDevice); + if (sharded_var_device.find(var) != sharded_var_device.end()) { + var_dev_id = sharded_var_device.at(var); + } + } + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; @@ -286,3 +352,8 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle + +USE_PASS(graph_viz_pass); +USE_PASS(multi_devices_pass); +USE_PASS(multi_devices_check_pass); +USE_PASS(multi_devices_print_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ffb9934a2d702b2bf6db7ad75a6bf9867e1e9901..5fb748fa205d5e9dbd2943b615c69aedd0e7a26f 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" @@ -70,7 +70,6 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; - std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -82,7 +82,7 @@ class Tensor { template const T* data() const; - bool IsInitialized() const; + inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. 
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include #include +#include "paddle/fluid/platform/float16.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -213,3 +214,17 @@ TEST(Tensor, Layout) { src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } + +TEST(Tensor, FP16) { + using platform::float16; + framework::Tensor src; + float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); + // EXPECT a human readable error message + // src.data(); + // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] +} diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 16c9c81258a9fdb7730b9b3e34be990798c91639..ba7645aa02413f28a648f35e381da7824604a455 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -14,8 +14,15 @@ cc_library(paddle_fluid_api get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +# paddle_fluid_origin exclude inference api interface +cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) + +if(NOT APPLE) + add_subdirectory(api) +endif() + # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -24,7 +31,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED - SRCS io.cc + SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) @@ -32,12 +39,21 @@ if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + # check symbol hidden + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" + "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" + " message(FATAL_ERROR \"Check symbol failed.\")\n" + "endif()\n") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" + DEPENDS paddle_fluid_shared) + add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) endif() -if(NOT APPLE) - add_subdirectory(api) -endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 67d355d10d3c9e11b59c9ce9d208826523095459..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc analyzer.cc helper.cc + model_store_pass.cc DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index b3a1075e5adf4a24bf32017574c061f36c46ba8c..c4ab26a2288bb9d8f3cd54a797d2062e0606b219 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,18 +17,22 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" namespace paddle { -DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, +DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true, "Enable subgraph to TensorRT engine for acceleration"); DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(inference_analysis_output_storage_path, "", + "optimized model output path"); + namespace inference { namespace analysis { @@ -38,15 +42,27 @@ class DfgPassManagerImpl final : public DfgPassManager { // 
TODO(Superjomn) set the key with pass reprs. AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { - auto trt_teller = [](const Node* node) { + auto trt_teller = [&](const Node* node) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); if (!node->IsFunction()) return false; - return static_cast(node)->func_type() == "mul"; + + const auto* func = static_cast(node); + if (teller_set.count(func->func_type())) + return true; + else { + return false; + } }; + AddPass("tensorrt-subgraph-marker", new TensorRTSubgraphNodeMarkPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_inference_analysis_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0132bf5b9c6552391aaa19542669487f42b685a7..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,28 +16,23 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether a process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. * * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing different command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include @@ -50,6 +45,7 @@ namespace paddle { // flag if not available.
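// [Editor's sketch] The trt_teller lambda above whitelists op types with an
// unordered_set. One possible refinement (an assumption, not part of the
// patch) is to build the set once in function-static storage instead of on
// every call:
#include <string>
#include <unordered_set>
static bool SupportedByTensorRT(const std::string& op_type) {
  static const std::unordered_set<std::string> kOps{
      "elementwise_add", "mul", "conv2d", "pool2d", "relu"};
  return kOps.count(op_type) > 0;  // O(1) membership test per node
}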
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +DECLARE_string(inference_analysis_output_storage_path); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements the analyzer -- an executable that helps analyze and + * optimize a trained model. + */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 25a440e7e71fddb38cc515f99d15231675a8172e..24bfb3993cf569561980006b6627b56327dd0f67 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,14 +20,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, analysis_without_tensorrt) { +TEST(Analyzer, analysis_without_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -TEST_F(DFG_Tester, analysis_with_tensorrt) { +TEST(Analyzer, analysis_with_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bff7a68754b0afec6463bd5d7579227f..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,6 +23,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" @@ -36,6 +37,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The paths of `__model__` and `param`; these are used when the file names + // of the model and parameters are changed.
+ std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph processed by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +55,9 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 8a3af0a8ebd5bad7be7046fa399cca4920da3d71..7f64bc75ae8ad40a268739cdc36051e76af9f49a 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { + std::vector op_nodes; + for (auto &node : GraphTraits(graph).nodes_in_TS()) { + if (node.type() == Node::Type::kValue || node.deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->type() == Node::Type::kFunction) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inlinks) { + follow_up_input_names.insert(in->name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outlinks) { + if (follow_up_input_names.count(out->name())) { + filtered_subgraph_outlinks.push_back(out); + } + } + PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + op_nodes[i]->outlinks = filtered_subgraph_outlinks; + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 1c60d5de21538043962cc58a6f508aea635fe8c4..bb3ec6bbc1d9555386aba8837b019d2511653258 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -36,6 +36,8 @@ namespace analysis { /* * DataFlowGraph - A container of Value and Function Nodes. + * + * This is the base graph for any other type of graph, such as SSA or CFG. */ struct DataFlowGraph { NodeMap nodes; @@ -174,8 +176,9 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph.
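// [Editor's sketch] What FilterRedundantOutputOfSubGraph above computes, in
// miniature: an op keeps only the outputs that some later op (in topological
// order) actually consumes. ToyOp is a hypothetical stand-in for the graph's
// nodes; the real pass additionally enforces that at least one output
// survives.
#include <string>
#include <unordered_set>
#include <vector>
struct ToyOp {
  std::vector<std::string> inputs, outputs;
};
static void FilterRedundantOutputs(std::vector<ToyOp>* ops) {
  for (size_t i = 0; i < ops->size(); ++i) {
    std::unordered_set<std::string> later_inputs;
    for (size_t j = i + 1; j < ops->size(); ++j) {
      later_inputs.insert((*ops)[j].inputs.begin(), (*ops)[j].inputs.end());
    }
    std::vector<std::string> kept;
    for (const auto& out : (*ops)[i].outputs) {
      if (later_inputs.count(out)) kept.push_back(out);  // still consumed
    }
    (*ops)[i].outputs = std::move(kept);
  }
}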
std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph); +ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT +void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86..a881262665f156812da9e1576aa29b05fc398499 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -20,7 +20,7 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); @@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) { } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); GraphTraits trait(&dfg); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 2328d870422c5a31c22d7b09980aae35e01b2b25..18c32fa09199003f17183207828cdfe4e627ae1a 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -23,7 +23,7 @@ namespace paddle { namespace inference { -DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size"); DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); namespace analysis { @@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + FilterRedundantOutputOfSubGraph(graph); LOG(INFO) << "graph.inputs " << graph->inputs.size(); for (auto &node : GraphTraits(graph).nodes_in_TS()) { if (node.deleted()) continue; @@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, - const framework::proto::BlockDesc &block) { + framework::proto::BlockDesc *block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; auto *func = static_cast(node); // collect inputs - std::vector io; + std::unordered_set input_names; for (auto *x : func->inlinks) { - io.push_back(x->name()); + input_names.insert(x->name()); } - desc.SetInput("Xs", io); + desc.SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); - // collect outputs - io.clear(); + std::unordered_set output_names; for (auto *x : func->outlinks) { - io.push_back(x->name()); + output_names.insert(x->name()); } - desc.SetOutput("Ys", io); + + std::vector output_temp(output_names.begin(), + output_names.end()); + desc.SetOutput("Ys", output_temp); desc.SetType("tensorrt_engine"); - PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); + std::unordered_map output_name_map; + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + // Why we do this? 
+ // During the transition from fluid OP to tensorrt OP, we map + // the input and output Tensor(fluid data structure) of fluid OP + // to the corresponding ITensor (trt data structure) through the + // Tensor name. When we set up an ITensor for a variable, we must + // ensure that it has not been set before. + // If a variable in the fluid graph is both the input of one op and the + // output of another op, there will be problems. + // So we have to rename the variable in the subgraph to make sure + // it is either an OP's input or an OP's output. + + auto subgraph_nodes = func->subgraph; + for (int index = 0; index < block->ops_size(); index++) { + framework::proto::OpDesc *op = block->mutable_ops(index); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->name(), op->type()); + + std::unordered_map var2id; + for (auto *in_var : correspond_node->inlinks) { + var2id[in_var->name()] = in_var->id(); + } + // rename the input variables of the ops inside the subgraph + for (int i = 0; i < op->inputs_size(); i++) { + framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { + std::string arg_value = in_var->arguments(k); + if (input_names.count(arg_value)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value + + std::to_string(var2id[arg_value])); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outlinks) { + var2id[out_var->name()] = out_var->id(); + } + + // rename the output variables of the ops inside the subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + if (output_names.count(arg_value)) { + output_name_map[arg_value] = + arg_value + std::to_string(var2id[arg_value]); + } + replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } + // When the tensorrt engine finishes running, at the end of the operation, + // output_mapping helps us copy the data from the renamed ITensors + // back to the Tensors. + std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs - SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + SetAttr(desc.Proto(), "output_name_mapping", output_mapping); node->SetPbMsg(desc.Proto()->SerializeAsString()); } @@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { LOG(INFO) << "transformed variable size: " << block_desc.Proto()->vars().size(); // copy ops.
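// [Editor's sketch] The renaming rule implemented above, reduced to one
// function: inside the TensorRT subgraph each variable argument gets a unique
// "<name><node-id>" alias so every ITensor name is bound exactly once, while
// the subgraph's external inputs keep their original names. Hypothetical,
// simplified signature:
#include <string>
#include <unordered_set>
static std::string RenameForTrt(
    const std::string& arg, int var_node_id,
    const std::unordered_set<std::string>& subgraph_inputs) {
  if (subgraph_inputs.count(arg)) return arg;  // fed from outside: keep name
  return arg + std::to_string(var_node_id);    // e.g. "fc_out" -> "fc_out7"
}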
+ for (auto *node : block_node->subgraph) { auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } + *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); + CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index a6f85484756417e103cbb60bcb664e8b800b9f28..c05b0e5d4690d0a447edf63a149903704bc2c9be 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { for (size_t i = 0; i < graph->nodes.size(); i++) { const Node &node = graph->nodes.Get(i); if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &in : node.inlinks) { - if (!config_.display_deleted_node && in->deleted()) continue; - dot.AddEdge(in->repr(), node.repr(), {}); + for (auto &out : node.outlinks) { + if (!config_.display_deleted_node && out->deleted()) continue; + dot.AddEdge(node.repr(), out->repr(), {}); } } return dot.Build(); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 65842b1e850953e77e3d4d28416609be271af9f1..928be7917047382d9b86294f6039b26b0ebf6f49 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - 
pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 496921db9eabce1b1e40c7cb13089446ca93321c..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,8 +26,20 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set; the " + "pass might be called twice"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } @@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); // insert vars + // `var2id` maps a variable's name to its Node id; the id keeps updating + // to the variable's latest alias while the graph is being built. std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); for (int i = 0; i < main_block.vars_size(); i++) { @@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { var2id[var.name()] = v->id(); } + // A variable in SSA form can only be written once, so if a variable is + // written multiple times (quite common in our ProgramDesc design), multiple + // alias Nodes of this variable will be created, and each will be written + // just once. + + // A set that keeps the names of all variables (the originals, not aliases) + // that have already been written (as outputs). Once an Op's output variable + // hits the set, we create a new alias and update the global alias for this + // variable. That is what makes the data flow graph an SSA.
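// [Editor's sketch] The alias rule described above: the first write to a name
// uses the name itself, and every later write mints a fresh versioned alias
// (a -> a1 -> a2 ...), which is what turns the graph into SSA form. A
// hypothetical helper, not part of the pass:
#include <string>
#include <unordered_map>
#include <unordered_set>
struct SsaNamer {
  std::unordered_set<std::string> written;
  std::unordered_map<std::string, int> version;
  // Returns the name an op should actually write to.
  std::string Write(const std::string& var) {
    if (written.insert(var).second) return var;    // first write
    return var + std::to_string(++version[var]);   // later write: new alias
  }
};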
+ std::unordered_set unique_written_vars; for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); - inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (inlinks.count(out)) { + if (unique_written_vars.count(out)) { // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). auto *out_alias = graph->nodes.Create(Node::Type::kValue); out_alias->SetName(out->name()); out_alias->SetPbDesc(out->pb_desc()); out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node LOG(INFO) << "loop found in graph, create SSA alias node [" << out_alias->repr() << "] for [" << out->repr() << "]"; out = out_alias; } out->inlinks.push_back(o); o->outlinks.push_back(out); + unique_written_vars.insert(out); } } } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index da8463b63bd0bb1633bfcb9d7d41a884ddd632c7..fb948bf2242abcbc1e841fd3b8457e63358782c5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -30,7 +30,7 @@ namespace inference { namespace analysis { /* - * Transform a FluidDesc to a data flow graph. + * Transform a FluidDesc to a SSA. */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index dadb84059d21adab44159a6145b345460663cb96..d218dcd05015aa4636c16569de4addf4936c8cd5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,8 +21,9 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c429176424bd5c1d8fa5e015c19d698f966880e --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only work on Linux.
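// [Editor's sketch] LoadProgramDesc in the helper.h hunk above uses the
// classic seekg/tellg slurp to read a binary file in one call; the same step
// in isolation (protobuf parsing and PADDLE_ENFORCE error handling omitted):
#include <fstream>
#include <string>
static std::string ReadFileToString(const std::string& path) {
  std::ifstream fin(path, std::ios::in | std::ios::binary);
  if (!fin.is_open()) return "";
  fin.seekg(0, std::ios::end);                                // jump to EOF...
  std::string buffer(static_cast<size_t>(fin.tellg()), ' ');  // ...size buffer
  fin.seekg(0, std::ios::beg);
  fin.read(&buffer[0], static_cast<std::streamsize>(buffer.size()));
  return buffer;
}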
+ ss << "mkdir -p " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fac7083925776b6209d49255c9e67b930cb1250b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. + */ + +#pragma once +#include +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in the analyzer + FLAGS_inference_analysis_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -50,6 +50,7 @@ class Pass { // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual void Run() { LOG(FATAL) << "not valid"; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { std::string description() const override { return "some doc"; } }; -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,12 +64,15 @@ manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + Argument argument(FLAGS_inference_model_dir); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } -TEST_F(DFG_Tester, Node_pass_manager) { +TEST(PassManager, Node_pass_manager) { + Argument argument(FLAGS_inference_model_dir); // Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass pass0; pass0.Initialize(&argument); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 389f9e1a9148a4daf0e5b751cce5cb6325252a4e..80809d4c43ca08298bad25cf614dcb4117d3f99a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { std::vector> SubGraphSplitter::ExtractSubGraphs() { std::vector marked_nodes; - for (auto &node : GraphTraits(graph_).nodes()) { + for (auto &node : GraphTraits(graph_).nodes_in_TS()) { if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 67dd4da54b95add703428e1fded61065f60353e8..39cc433b40fad17f4f12359d4e907a250a88bd63 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); 
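// [Editor's sketch] SubGraphSplitter (patched above to traverse nodes_in_TS)
// merges marked nodes into connected sub-graphs with a union-find, as in
// UnionFindCombine; a generic sketch of that primitive over integer ids:
#include <vector>
static int Find(std::vector<int>* parent, int x) {
  while ((*parent)[x] != x) {
    (*parent)[x] = (*parent)[(*parent)[x]];  // path halving
    x = (*parent)[x];
  }
  return x;
}
static void Union(std::vector<int>* parent, int a, int b) {
  (*parent)[Find(parent, a)] = Find(parent, b);
}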
SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 7e4b3e9a2dcae6b34d1af089bc7da55e09315c58..08d0f493ab30d92a121d089d9003bc575429b4dd 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -19,6 +19,7 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api) + if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) endif() @@ -42,35 +43,8 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api - SRCS api.cc api_impl.cc - DEPS 
${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -if(NOT APPLE) - set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym") - set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() - -# Here the shared library doesn't depend on other fluid libraries, or double free will occur. -cc_library(paddle_inference_api_shared SHARED - SRCS api.cc api_impl.cc) -add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) -if(NOT APPLE) - set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map") - set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake - "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" - " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n" - "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" - " message(FATAL_ERROR \"Check symbol failed.\")\n" - "endif()\n") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" - COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" - DEPENDS paddle_inference_api_shared) - add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") -endif() cc_test(test_paddle_inference_api SRCS api_tester.cc @@ -90,6 +64,8 @@ endif() if (WITH_ANAKIN) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to + # compile the libinference_anakin_api.a and compile with anakin.so. + fetch_include_recursively(${ANAKIN_INCLUDE}) # compile the libinference_anakin_api.a and anakin.so. nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) @@ -98,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI target_link_libraries(inference_anakin_api anakin anakin_saber_common) target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) if (WITH_TESTING) - cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc - ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + # this test is unstable, disable it first. + #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc + #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin + #DEPS inference_anakin_api_shared) + #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index e74f23ff969f5a8f58a71da337c16dcbc14f10c0..63c3f0d7b3f5c2b9246e2b041796caf5eb562826 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { @@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { // take over the other buffer; the moved-from buffer is left empty - assert(!other.memory_owned_); data_ = other.data_; length_ = other.length_; memory_owned_ = other.memory_owned_; + other.data_ = nullptr; + other.length_ = 0; + other.memory_owned_ = false; return *this; } void PaddleBuf::Resize(size_t length) { // Only the owned memory can be reset, the external memory can't be changed. if (length_ == length) return; - assert(memory_owned_); - Free(); + if (memory_owned_) { + Free(); + } data_ = new char[length]; length_ = length; memory_owned_ = true; @@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { assert(length_ > 0); - delete static_cast(data_); + delete[] static_cast(data_); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api.map b/paddle/fluid/inference/api/api.map deleted file mode 100644 index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api.map +++ /dev/null @@ -1,6 +0,0 @@ -{ - global: - *paddle*; - local: - *; -}; diff --git a/paddle/fluid/inference/api/api.sym b/paddle/fluid/inference/api/api.sym deleted file mode 100644 index ef2a04d788aa86b7f6a61c4af479d70d1137f374..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api.sym +++ /dev/null @@ -1 +0,0 @@ -*paddle* diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 0206ac60103759deda91be741617bde63e003de6..6b374ceefbc180a5c22abe591f12e1c3d89bc64a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -18,26 +18,36 @@ namespace paddle { -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( +template +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( const AnakinConfig &config) { CHECK(Init(config)); } -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { +template +bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { if (!(graph_.load(config.model_file))) { + LOG(FATAL) << "failed to load graph from " << config.model_file; return false; } - graph_.ResetBatchSize("input_0", config.max_batch_size); + auto inputs = graph_.get_ins(); + for (auto &input_str : inputs) { + graph_.ResetBatchSize(input_str, config.max_batch_size); + } // optimization for graph if (!(graph_.Optimize())) { return false; } // construct the executor - executor_.init(graph_); + if (executor_p_ == nullptr) { + executor_p_ = new anakin::Net(graph_, true); + } return true; } -bool PaddleInferenceAnakinPredictor::Run( +template +bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, std::vector *output_data, int batch_size) { for (const auto &input : inputs) { @@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - auto d_tensor_in_p = 
executor_.get_in(input.name); + auto d_tensor_in_p = executor_p_->get_in(input.name); + auto net_shape = d_tensor_in_p->valid_shape(); + if (net_shape.size() != input.shape.size()) { + LOG(ERROR) << " input " << input.name + << "'s shape size should be equal to that of net"; + return false; + } + int sum = 1; + for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); + if (sum > net_shape.count()) { + graph_.Reshape(input.name, input.shape); + delete executor_p_; + executor_p_ = new anakin::Net(graph_, true); + d_tensor_in_p = executor_p_->get_in(input.name); + } + + anakin::saber::Shape tmp_shape; + for (auto s : input.shape) { + tmp_shape.push_back(s); + } + d_tensor_in_p->reshape(tmp_shape); + float *d_data_p = d_tensor_in_p->mutable_data(); if (cudaMemcpy(d_data_p, static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), @@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run( } cudaStreamSynchronize(NULL); } - - executor_.prediction(); + cudaDeviceSynchronize(); + executor_p_->prediction(); + cudaDeviceSynchronize(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = executor_.get_out(output.name); - output.shape = tensor->shape(); + auto *tensor = executor_p_->get_out(output.name); + output.shape = tensor->valid_shape(); if (output.data.length() < tensor->valid_size() * sizeof(float)) { output.data.Resize(tensor->valid_size() * sizeof(float)); } @@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -anakin::Net - &PaddleInferenceAnakinPredictor::get_executer() { - return executor_; +template +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return *executor_p_; } // the cloned Predictor of anakin shares the same net weights with the original // Predictor -std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { +template +std::unique_ptr +PaddleInferenceAnakinPredictor::Clone() { VLOG(3) << "Anakin Predictor::clone"; - std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + std::unique_ptr cls( + new PaddleInferenceAnakinPredictor()); // construct the executor from the cloned graph auto anakin_predictor_p = - dynamic_cast(cls.get()); + dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { LOG(ERROR) << "failed to call Init"; return nullptr; @@ -103,14 +140,28 @@ std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { return std::move(cls); } +template class PaddleInferenceAnakinPredictor; +template class PaddleInferenceAnakinPredictor; + // A factory to help create predictors for different targets. 
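A caller-side sketch (not part of the patch; field values are illustrative) of how the templated predictor is now selected through the config:

    AnakinConfig config;
    config.target_type = AnakinConfig::NVGPU;  // or AnakinConfig::X86 for CPU
    config.model_file = "./mobilenet_v2.anakin.bin";
    config.device = 0;
    config.max_batch_size = 1;
    auto predictor =
        CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);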
template <> std::unique_ptr CreatePaddlePredictor< AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; -} + if (config.target_type == AnakinConfig::NVGPU) { + VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else if (config.target_type == AnakinConfig::X86) { + VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else { + VLOG(3) << "Anakin Predictor create on unknown platform."; + return nullptr; + } +}; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index def096c867ec85624f5b221782ef8b6240923c05..836badd9799228c6c294dcad5df73d039d36a1ff 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -20,14 +20,16 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" -// from anakin #include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "saber/core/shape.h" #include "saber/saber_types.h" namespace paddle { +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: PaddleInferenceAnakinPredictor() {} @@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { std::unique_ptr Clone() override; - anakin::Net& + anakin::Net& get_executer(); - ~PaddleInferenceAnakinPredictor() override{}; + ~PaddleInferenceAnakinPredictor() override { + delete executor_p_; + executor_p_ = nullptr; + }; private: bool Init(const AnakinConfig& config); - anakin::graph::Graph + anakin::graph::Graph graph_; - anakin::Net - executor_; + anakin::Net* + executor_p_{nullptr}; AnakinConfig config_; }; diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc index d6d631bfbad4278fe99e4553a410a9d9162dcc7b..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644 --- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include #include +#include "gflags/gflags.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(model, "", "Directory of the inference model (mobile_v2)."); namespace paddle { AnakinConfig GetConfig() { AnakinConfig config; + // use AnakinConfig::X86 if you need to run inference on the CPU + config.target_type = AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -36,28 +38,27 @@ TEST(inference, anakin) { CreatePaddlePredictor(config); float data[1 * 3 * 224 * 224] = {1.0f}; - - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds; - paddle_tensor_feeds.emplace_back(std::move(tensor)); + std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; - std::vector outputs; - outputs.emplace_back(std::move(tensor_out)); + std::vector outputs(1, tensor_out); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); float* data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < 1000; ++j) { + for (size_t j = 0; j < outputs[0].data.length(); ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516..08d7af6d3af7054061b15b904c69b2862c629562 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + feeds->push_back(input); } return true; @@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( buffer.Resize(sizeof(float) * data.size()); } std::memcpy(buffer.data(), data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetchs[i].lod()) { + outputs->at(i).lod.emplace_back(level); + } outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. 
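Since PaddleTensor now carries LoD across the API boundary, a feed with one LoD level would look roughly like this (illustrative values; the semantics follow fluid's LoD convention):

    PaddleTensor t;
    t.lod = {{0, 2, 5}};  // one level: two sequences, rows [0, 2) and [2, 5)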
} diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index c0891e9c281961fa03d278a0f5c676f92672c419..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 62d98a796708612e7d4ff8abfd85125978ce22c7..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { std::vector data(20); for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{ - .name = "", - .shape = std::vector({10, 1}), - .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 7f9bb4b33e97b5ea37e9216b00ce0c82ca3ce230..ba73a6eaa6fc885b6b56c2d6330394e2f9c384bf 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -55,11 +55,9 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a) else() set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so) endif() set(EXTERNAL_LIB "-lrt -ldl -lpthread") diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh new file mode 100755 index 0000000000000000000000000000000000000000..0d9f3d2aa237acaf3bd7adb031b1f2a73c555352 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -0,0 +1,4 @@ +set -x +cd `dirname $0` +rm -rf build/ data/ +set +x diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -47,10 +47,10 @@ void Main(bool use_gpu) { //# 2. 
Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0a2a2b713ab21a3124d8a85ba469f64278623ec4..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include #include +#include "paddle/fluid/inference/demo_ci/utils.h" #include "paddle/fluid/platform/enforce.h" -#include "utils.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); @@ -123,11 +123,11 @@ void Main(bool use_gpu) { file.close(); // Inference. - PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; std::vector output; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2f8b4f8596946988a728b5cf82de251bfda778a9..b24414e8245b1a4d90acce4fa1ad5690e06b47dd 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -40,6 +40,7 @@ class PaddleBuf { // Copy only available when memory is managed externally. explicit PaddleBuf(const PaddleBuf&); PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); // Do not own the memory. PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} @@ -67,9 +68,9 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; + std::vector> lod; // lod data }; enum class PaddleEngineKind { @@ -126,9 +127,11 @@ struct NativeConfig : public PaddlePredictor::Config { // Configurations for Anakin engine. 
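One caveat for the struct that follows: the new target_type field is declared without a default initializer, so callers should always set it before creating a predictor, e.g.:

    AnakinConfig config;
    config.target_type = AnakinConfig::X86;  // must be set explicitly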
struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; int device; std::string model_file; int max_batch_size{-1}; + TargetType target_type; }; struct TensorRTConfig : public NativeConfig { diff --git a/paddle/fluid/inference/api/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh similarity index 64% rename from paddle/fluid/inference/api/check_symbol.sh rename to paddle/fluid/inference/check_symbol.sh index 6547ca1413649968e8a0be146915e07192a99898..12b7b3e7e5982f193e48596b867953fc93841b61 100755 --- a/paddle/fluid/inference/api/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -3,8 +3,8 @@ lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi -num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l) -num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l) +num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l) if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 748f5a084e8c880df215a60fe51c835ba5cd3110..8f42a37cd3f8978b917b42e8f45a128b8422aa57 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,8 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc - DEPS tensorrt_engine mul_op) + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc +activation_op.cc + DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) @@ -13,3 +14,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) +nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) +nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) + +nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -20,11 +20,60 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: - Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("Input").front()); + // Declare weights + auto* Y_v = 
scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; + + const int groups = boost::get(op_desc.GetAttr("groups")); + const std::vector dilations = + boost::get>(op_desc.GetAttr("dilations")); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(filter_h, filter_w); + nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *const_cast(X), n_output, + nv_ksize, weight.get(), bias.get()); + PADDLE_ENFORCE(layer != nullptr); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + layer->setDilation(nv_dilations); + layer->setNbGroups(groups); + + auto output_name = op_desc.Output("Output").front(); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3744550f60a1696aedd8a3ecd24f1b21d22325b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ElementwiseWeightOpConverter : public OpConverter { + public: + ElementwiseWeightOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + PADDLE_ENFORCE(dims_x.nbDims >= 3); + + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + + std::vector dims_y = framework::vectorize2int(Y_t->dims()); + if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { + if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); + } + + if (static_cast(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (static_cast(dims_y.size()) == dims_x.nbDims && + dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != dims_x.d[i]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + break; + } + } + if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != 1) + PADDLE_THROW( + "TensorRT unsupported weight shape for Elementwise op!"); + } + } + } else { + PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); + } + + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(X), scale_mode, + shift_weights.get(), scale_weights.get(), power_weights.get()); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +class ElementwiseTensorOpConverter : public OpConverter { + public: + ElementwiseTensorOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + // The two input tensor should have the same dims + PADDLE_ENFORCE(dims_x.nbDims >= 3); + if (dims_x.nbDims == dims_y.nbDims) { + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + } else { + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong elementwise op type!"); + } + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } + + protected: + static const std::unordered_map + ops; + std::string op_type_; +}; + +const std::unordered_map + ElementwiseTensorOpConverter::ops = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"mul", nvinfer1::ElementWiseOperation::kPROD}, + {"sub", nvinfer1::ElementWiseOperation::kSUB}, + {"div", nvinfer1::ElementWiseOperation::kDIV}, + {"min", nvinfer1::ElementWiseOperation::kMIN}, + {"pow", nvinfer1::ElementWiseOperation::kPOW}, + {"max", nvinfer1::ElementWiseOperation::kMAX}, +}; + +class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMinOpConverter() { op_type_ = "min"; } +}; + +class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMaxOpConverter() { op_type_ = "max"; } +}; + +class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); + +REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, + ElementwiseTensorAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, + ElementwiseTensorSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, + ElementwiseTensorDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, + ElementwiseTensorMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, + 
ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, + ElementwiseTensorMinOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, + ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 409efac6799b6fb8d27a1343a55e7a508760868f..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, } // indata c * k // Reorder the data layout from CK to KC. -void ReorderCKtoKC(TensorRTEngine::Weight& iweights, +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT TensorRTEngine::Weight* oweights) { int c = iweights.dims[0]; int k = iweights.dims[1]; diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 3c342957360ad4192d838147bf37e84d233c2629..514eb659a8da73b6e56b5d17148ec0cb2aeaa135 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(mul); REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 968f7eb99ce8519edaa585fd3cb642bd80cc63cc..41faaf7212accaaec238062b1340e8da8fa6be33 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -55,6 +55,32 @@ class OpConverter { it = Registry::Lookup("fc"); } } + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + // TODO(xingzhaolong): all mul, sub, div + // static std::unordered_set add_weight_op_set {"add", "mul", + // "sub", "div"}; + static std::unordered_set add_weight_op_set{"add"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + int op_type_len = op_desc.Type().size(); + std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_weight"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } else { + PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_tensor"); + } + } + if (!it) { it = Registry::Lookup(op_desc.Type()); } diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..11cad95361867476c6f775af778015da37f1cfb1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights. + */ +class Pool2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, + *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7dabfd9f6a9a8cfbdd1d9a66541180d3499b7bdc..e82762ea03ecd00bce7cfb83b130a3436ccbfed3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(5); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(conv2d_op, test) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5)); + validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(conv2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(elementwise_op, add_weight_test) { + std::unordered_set parameters({"elementwise_add-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +TEST(elementwise_op, add_tensor_test) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(8, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + // the default axis of elementwise op is -1 + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 081f4d605975f1408d4d8a8ed3108c04d837a4de..1ae2668e733aad23241c63b9985e708396d0b1bc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -24,9 +24,8 @@ TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1)); + validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); // Prepare Op description @@ -38,7 +37,7 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(10); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 674f37f2fdddf013a8f6f4671debbc19c3322423..3d34cd7d5d0deca4d83a3f5b5ed0fb396c6acd56 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,7 +23,7 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + TRTConvertValidation validator(10, parameters, scope, 1000, false); validator.DeclInputVar("mul-X", 
nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); @@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(2); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9b79f86b0edba983019bd932f52b08711ff36d41..d6651a5b244ba31a01220e6299cb2016ae61fe64 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* conv2d_op = block->AppendOp(); + + // init trt engine + cudaStream_t stream_; + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); + engine_->InitNetwork(); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + + engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(2, 5, 5)); + conv2d_op->SetType("conv2d"); + conv2d_op->SetInput("Input", {"conv2d-X"}); + conv2d_op->SetInput("Filter", {"conv2d-Y"}); + conv2d_op->SetOutput("Output", {"conv2d-Out"}); - OpConverter converter; + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + conv2d_op->SetAttr("strides", strides); + conv2d_op->SetAttr("paddings", paddings); + conv2d_op->SetAttr("dilations", dilations); + conv2d_op->SetAttr("groups", groups); + + // init scope framework::Scope scope; - converter.ConvertBlock(*block->Proto(), {}, scope, - nullptr /*TensorRTEngine*/); + std::vector dim_vec = {3, 2, 3, 3}; + auto* x = scope.Var("conv2d-Y"); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + + OpConverter converter; + converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, + engine_.get() /*TensorRTEngine*/); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(Pool2dOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. 
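To make the C * H * W comment concrete: with the attributes set below (ksize 2x2, stride 2, no padding), each 4x4 feature map pools to 2x2, since H_out = (H - ksize_h) / stride_h + 1 = (4 - 2) / 2 + 1 = 2; hence the declared shapes Dims3(3, 4, 4) in and Dims3(3, 2, 2) out.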
+ validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d-X"}); + desc.SetOutput("Out", {"pool2d-Out"}); + + std::vector ksize({2, 2}); + std::vector strides({2, 2}); + std::vector paddings({0, 0}); + std::string pooling_t = "max"; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f14885b238134cdf38a278cd8a0734947bcacfe0..63c2f978f253df11100ecca83acae5eab6a0337d 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -63,13 +63,16 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - TRTConvertValidation(int batch_size, + TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10) - : parameters_(parameters), scope_(scope) { + int workspace_size = 1 << 10, bool if_add_batch = true) + : parameters_(parameters), + scope_(scope), + if_add_batch_(if_add_batch), + max_batch_size_(max_batch_size) { // create engine. - engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); @@ -84,7 +87,7 @@ class TRTConvertValidation { // Declare a parameter varaible in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { - DeclVar(name, dims); + DeclVar(name, dims, true); } void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { @@ -92,12 +95,18 @@ class TRTConvertValidation { } // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); + // There is no batchsize in ITensor's shape, but We should add it to + // tensor's shape of fluid. If the variable is not parameter and the + // if_add_batch_ flag is true, add the max batchsize to dim_vec. 
+ if (is_param != true && if_add_batch_ == true) + dim_vec.insert(dim_vec.begin(), max_batch_size_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -131,6 +140,7 @@ class TRTConvertValidation { void Execute(int batch_size) { // Execute Fluid Op + PADDLE_ENFORCE_LE(batch_size, max_batch_size_); platform::CPUPlace place; platform::CPUDeviceContext ctx(place); op_->Run(scope_, place); @@ -139,7 +149,7 @@ class TRTConvertValidation { cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 2000; + const size_t output_space_size = 3000; for (const auto& output : op_desc_->OutputArgumentNames()) { std::vector fluid_out; std::vector trt_out(output_space_size); @@ -149,9 +159,15 @@ class TRTConvertValidation { auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + + size_t fluid_out_size = fluid_out.size(); + if (if_add_batch_ == true) { + fluid_out_size = + batch_size * (framework::product(tensor->dims()) / max_batch_size_); + } // Compare two output ASSERT_FALSE(fluid_out.empty()); - for (size_t i = 0; i < fluid_out.size(); i++) { + for (size_t i = 0; i < fluid_out_size; i++) { // Loosen the threshold for CI on different machine models. EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } @@ -167,6 +183,12 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; + // The ITensor of trt does not contain the batch size, + // but in most cases we need to set the batch size for + // fluid's tensor shape. This variable indicates + // whether to add the batch size to the fluid tensor's shape. + bool if_add_batch_; + int max_batch_size_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index f8732e51b66bdc78aa35d06ba9651f1942a74b01..dc03702990587bf5e65d28da662d10df4d882110 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ASSERT_EQ(y_cpu[1], 14.5); } -TEST_F(TensorRTEngineTest, test_conv2d_temp) { +TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; @@ -146,6 +146,37 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ASSERT_EQ(y_cpu[1], 6.0); } +TEST_F(TensorRTEngineTest, test_pool2d) { + // Pooling layers have no weights; only the input needs to be declared.
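The expected values at the end of this test follow directly from the input: the 8 floats form two 1x2x2 samples, and a 2x2 average pool reduces each sample to a single value, (1 + 2 + 5 + 0) / 4 = 2 and (2 + 3 + 5 + 10) / 4 = 5, which is exactly what the two ASSERT_EQ checks verify.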
+ auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto* pool_layer = + TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), + pool_t, nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 8 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[2]; + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 2fa5a9540ba1311c7f87e6675a53044b23dd8276..017fc4cd7b11c150cb941fffca2606a4d707330f 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) @@ -43,6 +43,6 @@ inference_test(word2vec) # TODO(TJ): clean me up cc_test(test_inference_nlp SRCS test_inference_nlp.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5cc1db12bb71e428d493e7c6f718b1c6ed431858..e2a3e9d46ef9f303d191d59253ffbe9f4826184b 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -20,9 +20,6 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/cpu_helper.h" -#ifdef PADDLE_WITH_MKLML -#include -#endif DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); @@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); DECLARE_bool(use_mkldnn); +DECLARE_int32(paddle_num_threads); inline double GetCurrentMs() { struct timeval time; @@ -160,12 +158,7 @@ TEST(inference, nlp) { std::unique_ptr scope( new paddle::framework::Scope()); -#ifdef PADDLE_WITH_MKLML - // only use 1 thread number per std::thread - omp_set_dynamic(0); - omp_set_num_threads(1); - paddle::platform::SetNumThreads(1); -#endif + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 01a8501dd4abe73cbc71dc4c08734cae66df08ef..c2f45fdc99b87bc12c2aadf1985de6e98a24fce7 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -15,6 +15,10 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "glog/logging.h" +DEFINE_bool(free_idle_memory, false, + "If it is true, Paddle will try to free idle memory trunks during " + "running time."); + namespace paddle { namespace memory { namespace detail { @@ -152,13 +156,14 @@ void BuddyAllocator::Free(void* p) { pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); - // Clean up if existing too much free memory - - // Prefer freeing fallback allocation first - CleanIdleFallBackAlloc(); + if (FLAGS_free_idle_memory) { + // Clean up if existing too much free memory + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); - // Free normal allocation - CleanIdleNormalAlloc(); + // Free normal allocation + CleanIdleNormalAlloc(); + } } size_t BuddyAllocator::Used() { return total_used_; } diff --git a/paddle/fluid/operators/.flatten_op.cc.swp b/paddle/fluid/operators/.flatten_op.cc.swp new file mode 100644 index 0000000000000000000000000000000000000000..3395b6074b6a4c684a97674af702ca8b91dc85e9 Binary files /dev/null and b/paddle/fluid/operators/.flatten_op.cc.swp differ diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4e2002ad24415437ae4f85eba0e90a6c689e2996..4c3b8ec78190723598a56f7633764f10dd5047f3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -192,9 +192,9 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) @@ -270,6 +270,9 @@ op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(unsqueeze_op DEPS reshape_op) 
op_library(squeeze_op DEPS reshape_op) +op_library(extract_rows_op DEPS memory) +op_library(flatten_op DEPS reshape_op) + if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 1828be57b5a54005a0066b18ebebdb740726f67a..22cbf680c0670552fb014043c69fcadc56863529 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_deterministic, true, +DEFINE_bool(cudnn_deterministic, false, "Whether to allow using an autotuning algorithm for the convolution " "operator. The autotuning algorithm may be non-deterministic. If " - "false, the algorithm is deterministic." + "true, the algorithm is deterministic."); namespace paddle { namespace operators { @@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { // cudnn 7 can support groups, no need to do it manually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto handle = dev_ctx.cudnn_handle(); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &algo)); @@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { if (dev_ctx.GetComputeCapability() >= 70 && std::type_index(typeid(T)) == std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } #endif // get workspace size able to allocate - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // It is possible for float16 on Volta GPU to allocate more memory than @@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { // ------------------- cudnn conv forward --------------------- ScalingParamType<T> alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, @@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { // cudnn 7 can support groups, no need to do it manually //
FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -272,8 +272,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + if (!FLAGS_cudnn_deterministic) { + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, // dyDesc: Handle to the previously initialized input @@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); @@ -297,8 +297,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + if (!FLAGS_cudnn_deterministic) { + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); @@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, @@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
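// cuDNN convolution routines blend results as y = alpha * conv(...) + beta * y;
// with beta == 0 the destination buffer is completely overwritten, so neither
// input_grad nor filter_grad has to be zero-initialized before these calls.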
for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, filter_algo, cudnn_workspace, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 5098bd8700e11c9a2faeba90c38ed2d9499b17cf..f07ab5a33b87d7945e5fcdf8f3644f0711ce643b 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive( const std::shared_ptr<mkldnn::memory> user_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, @@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive( const std::shared_ptr<mkldnn::memory> user_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive( const std::shared_ptr<mkldnn::memory> user_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive( const std::shared_ptr<mkldnn::memory> user_weights_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); auto user_pd = user_weights_memory_p->get_primitive_desc(); return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, @@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive( const std::shared_ptr<mkldnn::memory> user_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto src_pd = conv_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", @@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive( const std::shared_ptr<mkldnn::memory> user_weights_memory_p, - std::vector<mkldnn::primitive>& pipeline) { + std::vector<mkldnn::primitive>& pipeline) { // NOLINT auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, @@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { // Generate keys for storing/retrieving primitives for this operator // TODO(jczaja): Make hashing function more optimal - static std::string GetHash(memory::dims& input_dims, - memory::dims& weights_dims, - std::vector<int>& strides, - std::vector<int>& paddings, - std::vector<int>& dilations, int groups, - const std::string& suffix) { + static std::string
GetHash(memory::dims& input_dims, // NOLINT + memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + suffix; @@ -280,12 +280,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), memory::format::any); + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), memory::format::any); + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward std::shared_ptr conv_pd = @@ -423,16 +427,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { * ('any') which lets a primitive (conv backward in this case) choose * the memory format preferred for best performance */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), memory::format::any); + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), memory::format::any); + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), memory::format::any); + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); // Retrieve conv_pd from device context auto conv_pd = diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 038ea8999072f562104c5386ed18b6b275816345..82fff68e7557b3f0b44e6faf2a50e5a0ecbba589 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); // Get the algorithm - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. 
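// CUDNN_ENFORCE (from platform/cudnn_helper.h) evaluates the wrapped call,
// compares the returned cudnnStatus_t against CUDNN_STATUS_SUCCESS, and
// throws with the decoded cuDNN error string on failure.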
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); // get workspace size able to allocate - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, @@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &fwd_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); @@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { if (filter_grad) { // choose backward algorithm for filter - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit, &filter_algo)); // get workspace for backwards filter algorithm - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); @@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_conv_desc, data_algo, @@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. 
// Gradient with respect to the filter for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, filter_algo, diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1612927055dd4ec5ee2220bc2b285e8d9b640ea8..da5d20505e9b06c0717af8d79d5456a9ade1e89c 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -17,9 +17,9 @@ if(WITH_GRPC) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) return() endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 265f964ddc682868c64669744b130aebbbf86692..b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -49,6 +49,7 @@ void GRPCClient::SendComplete() { } GRPCClient::~GRPCClient() { + stopped_ = true; Wait(); cq_.Shutdown(); { @@ -275,7 +276,7 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - while (cq_.Next(&tag, &ok)) { + while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 8351d825f817437e1b3691e916952dd9a86af491..0c95ffeb5ce7e1586c5968fb122acd12c0c0196e 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() : ok_(true), completed_(false) {} + GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -237,6 +237,8 @@ class GRPCClient : public RPCClient { // mutex for sending complete message only once std::mutex completed_mutex_; bool completed_; + + volatile bool stopped_; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 55995783c6eab10632ab2a5bca64ca856f000df1..de1a503154deb967eb4389a9f43b86c05626d966 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -41,6 +41,7 @@ bool 
RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { + rpc_server_->Profiler().OneStep(); try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 83b14fa64d735d80f43bf55c798cddb2f3ea7032..406e7294c190172347d432fb155c2a81c43dda25 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -18,11 +18,44 @@ #include #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_int32(rpc_server_profile_period, 0, + "the step period at which listen_and_serv runs the profiler; " + "a value <= 0 disables profiling"); +DEFINE_string(rpc_server_profile_path, "/dev/null", + "the profile log file path"); namespace paddle { namespace operators { namespace distributed { +RPCServerProfiler::RPCServerProfiler(int profile_period, + const std::string& profile_log_path) + : profile_period_(profile_period), profile_log_path_(profile_log_path) { + step_ = 0; +} + +void RPCServerProfiler::OneStep() { + PADDLE_ENFORCE_LE(step_, profile_period_, + "step_ should not be larger than " + "profile_period_"); + if (profile_period_ <= 0) { + return; + } + + if (step_ == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } + if (step_ == profile_period_) { + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal, + profile_log_path_); + step_ = 0; + } else { + step_++; + } +} + void RPCServer::ShutDown() { LOG(INFO) << "RPCServer ShutDown "; ShutDownImpl(); diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index fd914d7a72e61bc9472876c433b65598ef5b1980..d813ba03e2fbec6e808f59f814a9b2f4bfbcd77b 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -19,16 +19,33 @@ #include // NOLINT #include #include + #include "paddle/fluid/operators/distributed/request_handler.h" +DECLARE_int32(rpc_server_profile_period); +DECLARE_string(rpc_server_profile_path); + namespace paddle { namespace operators { namespace distributed { +class RPCServerProfiler { + public: + RPCServerProfiler(int profile_period, const std::string& profile_log_path); + void OneStep(); + + private: + const int profile_period_; + std::string profile_log_path_; + int step_; +}; + class RPCServer { public: explicit RPCServer(const std::string& address, int client_num) : cur_cond_(0), + profiler_(FLAGS_rpc_server_profile_period, + FLAGS_rpc_server_profile_path), bind_address_(address), exit_flag_(false), selected_port_(0), @@ -67,6 +84,7 @@ class RPCServer { void Complete(); void ResetBarrierCounter(); + RPCServerProfiler& Profiler() { return profiler_; } protected: virtual void ShutDownImpl() = 0; @@ -79,6 +97,7 @@ class RPCServer { std::unordered_map<std::string, int> rpc_cond_map_; std::atomic<int> cur_cond_; std::condition_variable rpc_cond_; + RPCServerProfiler profiler_; protected: std::string bind_address_; diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 9f2360ec70d2ce5d4e16435595e109c1bf04fd13..b50830c362d3f6ecf38affbfa6a1ffe2ed77e125 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -30,7 +30,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace
distributed = paddle::operators::distributed; -USE_OP(lookup_table); +USE_NO_KERNEL_OP(lookup_sparse_table); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; @@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::VariableNameMap output({{"Output", {"out"}}}); auto op = block->AppendOp(); - op->SetType("lookup_table"); + op->SetType("lookup_sparse_table"); op->SetInput("W", {"w"}); op->SetInput("Ids", {"ids"}); op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({10, 10}); return block; @@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { w_var->GetMutable(); auto out_var = scope->Var("out"); - out_var->GetMutable(); + out_var->GetMutable(); auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); + ids_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, @@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) { client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); client->Wait(); auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); - auto ptr = value.mutable_data(place); + auto value = var->GetMutable(); + auto ptr = value->mutable_data(place); for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast(i * 2)); } } diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc index 1a5427b39241b666eeaf12b173ea00443bb5f6e4..c86cd57316078778e5930c9b524b931d523028d7 100644 --- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); - auto y_dims = y->dims(); + auto y_dims_untrimed = y->dims(); auto z_dims = z->dims(); // Execute default elementwise_add operator when // broadcast operations need to performed. - if (x_dims != y_dims) { + if (x_dims != y_dims_untrimed) { auto sum_func = [](T a, T b) -> T { return a + b; }; TransformFunctor { ctx.template device_context(), sum_func); - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post; @@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { "Wrong layout/format set for Y tensor"); std::vector src_x_tz = framework::vectorize2int(x_dims); - std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims_untrimed); std::vector dst_tz = framework::vectorize2int(z_dims); std::vector srcs_pd; @@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + // skip out, x, y, + // dout length is larger or equal than dx, dy. + auto* out = dout; + auto *x = dout, *y = dout; auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { in->set_layout(DataLayout::kMKLDNN); in->set_format(out->format()); }; - if (x->dims() == y->dims()) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dx, dout); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dy, dout); + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { + if (dx->dims() == dy->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } } } else { // Execute default kernel when broadcast is needed - ElemwiseGradCompute, IdentityGrad>( + ElemwiseExplicitGradCompute, IdentityGrad>( ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); } diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", + "X"); REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu index dfff518f170b56d180b6883c363effb8dbd677b6..6cbf6066c92b02eb75922587f9da5192bed15580 100644 --- a/paddle/fluid/operators/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -16,6 +16,60 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +template +__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n, + int post, int size) { + int idx_x = threadIdx.x + blockIdx.x * blockDim.x; + if (idx_x < size) { + int idx_y = idx_x / post - (idx_x / (n * post)) * n; + z[idx_x] = x[idx_x] + y[idx_y]; + } +} + +template +class ElementwiseAddKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using Tensor = framework::Tensor; + + const auto x = ctx.Input("X"); + const auto y = ctx.Input("Y"); + auto z = ctx.Output("Out"); + auto *z_data = z->mutable_data(ctx.GetPlace()); + + auto &device = *(ctx.cuda_device_context().eigen_device()); + const framework::DDim &x_dim = x->dims(); + framework::DDim y_dim = y->dims(); + int size = x->numel(); + if (x_dim == y_dim) { + auto dim = framework::make_ddim({size}); + auto z_eigen = framework::EigenTensor::From(*z, dim); + auto x_eigen = framework::EigenTensor::From(*x, dim); + auto y_eigen = framework::EigenTensor::From(*y, dim); + z_eigen.device(device) = x_eigen + y_eigen; + } else { + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); + y_dim = trim_trailing_singular_dims(y_dim); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + int threads = 512; + int grids = (size + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + ElementwiseAddCUDAKernel<<>>( + x->data(), y->data(), z_data, n, post, size); + } + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index baf04c30b17cb333fc8a6544afd6c479442f835b..0b19723720171a857c946880c246e2247a0023a7 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - ElemwiseGradCompute, IdentityGrad>( - ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), - IdentityGrad()); + ElemwiseExplicitGradCompute, + IdentityGrad>(ctx, *x, *y, *out, *dout, axis, + dx, dy, IdentityGrad(), + IdentityGrad()); } template @@ -140,18 +141,44 @@ class ElementwiseAddGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { - elementwise_add_grad(ctx, x, y, out, dout, dx, dy); + if (dx != nullptr) { + // In fact, we can just share memory, but it may cause a bug of memory + // optimizer + // dx->ShareDataWith(*dout); + framework::TensorCopy(*dout, ctx.GetPlace(), + ctx.template device_context(), dx); + } + + if (dy == nullptr) return; + + const framework::DDim& x_dim = dout->dims(); + framework::DDim y_dim = dy->dims(); + if (x_dim == y_dim) { + // dy->ShareDataWith(*dout); + framework::TensorCopy(*dout, ctx.GetPlace(), + ctx.template 
device_context(), dy); } else { - default_elementwise_add_grad(ctx, x, y, out, dout, dx, - dy); + dy->mutable_data(ctx.GetPlace()); + // Perform reduction to dout to calculate dy + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); + y_dim = trim_trailing_singular_dims(y_dim); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + auto& device = + *(ctx.template device_context().eigen_device()); + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + auto eigen_dout = framework::EigenTensor::From( + *dout, framework::make_ddim({pre, n, post})); + auto eigen_dy = + framework::EigenTensor::From(*dy, framework::make_ddim({n})); + eigen_dy.device(device) = eigen_dout.sum( + framework::EigenDim<2>::From(framework::make_ddim({0, 2}))); } } }; diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644 --- a/paddle/fluid/operators/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; + REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); + REGISTER_OP_CPU_KERNEL( elementwise_div, ops::ElementwiseDivKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index bb88970e42c194d9437609b62435f1a89e2b446b..d8a12e800ad733800c1ec333f15d31d4dcd1a3a5 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op.").Reuse("X"); + // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save + // memory.").AsIntermediate(); + AddOutput("Out", "The output of elementwise op."); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") @@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$. )DOC", GetName(), GetEquation())); + SetReuse(); } protected: virtual std::string GetName() const = 0; virtual std::string GetEquation() const = 0; + virtual void SetReuse() {} }; class ElementwiseOpGrad : public framework::OperatorWithKernel { @@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; + +// For Add, Sub op, the X, Out is not needed. 
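+// Rationale: for Out = X + Y, dX = dOut and dY is dOut reduced over Y's
+// broadcast dimensions; for Out = X - Y, dY is the negated reduction. Neither
+// gradient ever reads X or Out, so the explicit grad op only needs Input(Y)
+// (for its shape) and Input(Out@GRAD).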
+class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { + public: + using operators::ElementwiseOpGrad::ElementwiseOpGrad; + using operators::ElementwiseOpGrad::GetExpectedKernelType; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, out_dims); + } + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + } // namespace operators } // namespace paddle +/* +*/ + +#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ + class kernel_type##GradMaker \ + : public paddle::framework::SingleGradOpDescMaker { \ + public: \ + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ + \ + protected: \ + std::unique_ptr Apply() const override { \ + auto* op = new paddle::framework::OpDesc(); \ + op->SetType(#kernel_type "_grad"); \ + op->SetInput("Y", Input("Y")); \ + op->SetInput(::paddle::framework::GradVarName("Out"), \ + OutputGrad("Out")); \ + op->SetAttrMap(Attrs()); \ + op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ + op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \ + return std::unique_ptr<::paddle::framework::OpDesc>(op); \ + } \ + } + #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ class __ElemwiseOp##op_type##Maker__ \ : public ::paddle::operators::ElementwiseOpMaker { \ @@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ::paddle::operators::ElementwiseOpInferVarType, \ ::paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) + +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + virtual void SetReuse() { Reuse(__VA_ARGS__); } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 8b052611f80ddf874ca48c1c58e13346528a834e..bc3e95e904f8b6c2cdd2ae6685bf67580178e6b6 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -inline void trim_trailing_singular_dims(framework::DDim* dims) { +inline framework::DDim trim_trailing_singular_dims( + const framework::DDim& dims) { // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims->size(); + auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { - if ((*dims)[actual_dims_size - 1] != 1) break; + if (dims[actual_dims_size - 1] != 1) break; } - if (actual_dims_size != dims->size()) { - auto actual_dims = framework::vectorize(*dims); - actual_dims.resize(actual_dims_size); - *dims = framework::make_ddim(actual_dims); + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; } + framework::DDim actual_dims = framework::make_ddim(trim_dims); + return actual_dims; } template @@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, #endif +template +void ElemwiseGradComputeNoBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); + platform::ForRange for_range( + ctx.template device_context(), N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +void ElemwiseGradComputeWithBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim_untrimed, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast1CPU( + x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), pre, n, post, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast2CPU( + x.data(), y.data(), out.data(), dout.data(), pre, n, post, + dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + template void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& y, @@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& dout, int axis, framework::Tensor* dx, framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + const framework::DDim& x_dim = x.dims(); + const framework::DDim& y_dim = y.dims(); if (x.dims() == y.dims()) { - size_t N = static_cast(framework::product(x.dims())); - platform::ForRange for_range( - ctx.template device_context(), N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); + ElemwiseGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { // Y is a scalar - auto x_dim = x.dims(); - auto y_dim = y.dims(); - - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - trim_trailing_singular_dims(&y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - if (post == 1) { - int h = pre; - int w = n; - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), h, w, - dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } + ElemwiseGradComputeWithBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. 
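+// A minimal sketch of the gradient functors this relies on (cf.
+// elementwise_add_op.h and elementwise_sub_op.h); none of them ever reads x,
+// y or out, which is what makes the dout substitution safe:
+//
+//   template <typename T>
+//   struct IdentityGrad {
+//     HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
+//   };
+//
+//   template <typename T>
+//   struct SubGradDY {
+//     HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
+//   };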
+template +void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, + const framework::Tensor& x, + const framework::Tensor& y, + const framework::Tensor& out, + const framework::Tensor& dout, int axis, + framework::Tensor* dx, framework::Tensor* dy, + DX_OP dx_op, DY_OP dy_op) { + if (dy == nullptr) { + const framework::DDim& dx_dims = dout.dims(); + auto dy_dims = dx_dims; + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + if (dout.dims() == dy->dims()) { + const framework::DDim& dx_dims = dout.dims(); + const framework::DDim& dy_dims = dy->dims(); + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { // Y is a scalar + auto dx_dims = dout.dims(); + const framework::DDim& dy_dims = dy->dims(); + ElemwiseGradComputeWithBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } } +// Deprecated template void ElementwiseGradCompute(const framework::ExecutionContext& ctx, @@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, } axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(&y_dims); + trim_trailing_singular_dims(y_dims); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), "Rank of first input must >= rank of second input."); - if (x_dims == y_dims) { + if (x_dims == y_dims_untrimed) { functor.Run(); return; } - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", + "X"); + REGISTER_OP_CPU_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index fe088b8203722a43b9aba7be3878b8f4ca68ba12..11c7e3fe628001f095836a788f2bcc7c4ee7ad4b 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElemwiseGradCompute, SubGradDY>( + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; + + ElemwiseExplicitGradCompute, SubGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a297d03cfb041e584159a5fc5ba214f8ac404b4 --- /dev/null +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <string> +#include <vector> +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ExtractRowsOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0], + framework::proto::VarType::SELECTED_ROWS, + "The type of input(X) must be SelectedRows."); + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1})); + } +}; + +class ExtractRowsOp : public framework::OperatorBase { + public: + ExtractRowsOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); + auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); + + auto in_rows = in.rows(); + auto out_dim = framework::make_ddim( + std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); + auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); + + if (paddle::platform::is_gpu_place(in.place())) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(in.place()); + auto src_ptr = in_rows.Data(in.place()); + auto stream = + reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx) + .stream(); + memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr, + boost::get<platform::CUDAPlace>(in.place()), src_ptr, + in_rows.size() * sizeof(int64_t), stream); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } else { + memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(), + in_rows.data(), in_rows.size() * sizeof(int64_t)); + } + } +}; + +class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(SelectedRows). The input tensor of extract_rows operator," + " and its type is SelectedRows."); + AddOutput("Out", "(Tensor). The rows of input(X)."); + + AddComment(R"DOC( + ExtractRows Operator. + +The extract_rows_op extracts the rows from the input(X), +whose type must be SelectedRows.
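+ +For example, if Input(X) is a SelectedRows whose rows are [0, 4, 7], then +Output(Out) is an int64 Tensor of shape (3, 1) holding [[0], [4], [7]].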
+ + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker, + ops::ExtractRowsOpInferShape); diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644 --- a/paddle/fluid/operators/feed_op.cc +++ b/paddle/fluid/operators/feed_op.cc @@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase { const platform::Place &place) const override { // get device context from pool auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 680fde19eefe57475b7526ebc29d4ff977a16977..d9cd956dfdff3d009d38ee5088f5396080580483 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fdda01381e117cecffb2a05f8399f3ad82a46339 --- /dev/null +++ b/paddle/fluid/operators/flatten_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class FlattenOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input (X) of Flatten op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of Flatten op should not be null.");
+    const auto &axis = ctx->Attrs().Get<int>("axis");
+    const auto &in_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0.");
+    PADDLE_ENFORCE(
+        axis <= in_dims.size(),
+        "The axis should be less than or equal to input tensor's rank.");
+
+    const auto &out_dims = GetOutputShape(axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1, inner = 1;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      if (i < axis) {
+        outer *= in_dims[i];
+      } else {
+        inner *= in_dims[i];
+      }
+    }
+    std::vector<int32_t> out_shape(2);
+    out_shape[0] = outer;
+    out_shape[1] = inner;
+    return out_shape;
+  }
+};
+
+class FlattenOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddOutput("Out",
+              "A 2D tensor that contains the reshaped input tensor. The "
+              "input dimensions up to axis are flattened to the outer "
+              "dimension of the output, and the remaining input dimensions "
+              "are flattened into the inner dimension of the output.");
+    AddAttr<int>("axis",
+                 "(int) "
+                 "Indicates up to which input dimensions (exclusive) should "
+                 "be flattened to the outer dimension of the output. The "
+                 "value for axis must be in the range [0, R], where R is the "
+                 "rank of the input tensor. When axis = 0, the shape of the "
+                 "output tensor is (1, (d_0 X d_1 ... d_n)), where the shape "
+                 "of the input tensor is (d_0, d_1, ... d_n).")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a 2D matrix.
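+
+(Editorial summary of GetOutputShape above.) For an input of shape
+(d_0, d_1, ... d_n) and a given axis, the output shape is
+
+    (d_0 X d_1 ... X d_{axis-1}, d_{axis} X ... X d_n),
+
+i.e. the outer dimension is the product of the dimensions before axis and the
+inner dimension is the product of the remaining ones, as the examples below
+show.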
+
+Examples:
+Case 1:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 2
+  We get:
+    Out.shape = (3 * 100, 100 * 4)
+
+Case 2:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 0
+  We get:
+    Out.shape = (1, 3 * 100 * 100 * 4)
+)DOC");
+  }
+};
+
+class FlattenGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class FlattenGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(in_dims);
+    attrs["inplace"] = false;
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
+                  ops::FlattenOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6fd0aeb021dce40339c32251af130d5984dccd2
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Y"),
+        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(vector<Tensor>)");
+    AddInput("Y", "(vector<Tensor>)");
+    AddOutput("Out", "vector<Tensor>");
+    AddAttr<int>("axis",
+                 "axis is used by elementwise_op; the default value is -1.")
+        .SetDefault(-1);
+    AddAttr<float>("scale",
+                   "scale is used by scale_op; the default value is 0.0.")
+        .SetDefault(0.0);
+    AddAttr<bool>("recomputation",
+                  "Whether to recompute the Out. "
+                  "fused_elemwise_activation_grad has two methods to get the "
+                  "dx and dy: one uses the 'Out', and the other does not. "
+                  "The former saves the time of recomputing the 'Out', but "
+                  "it must occupy memory to store the 'Out', while the "
+                  "latter avoids occupying this memory but must recompute "
+                  "the 'Out'. The default value is true.")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>("functor_list",
+                                      "The functors that should be fused.")
+        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
+          PADDLE_ENFORCE(ValidCheck(functor_list));
+        });
+
+    AddComment(R"DOC(
+FusedElemwiseActivation Operator.
+
+At present, FusedElemwiseActivation only supports two kinds of compound
+operators (elementwise_op and activation_op):
+
+    Z = Binary(X, Unary(Y))
+    Z = Unary(Binary(X, Y))
+
+The attributes of activation_op can be obtained from
+fused_elemwise_activation_op's attributes. functor_list records the functors
+to be fused, for example "scale,elementwise_add".
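+
+For instance (an illustrative sketch, assuming the scale functor multiplies
+its input by the 'scale' attribute as defined in math/functors.h):
+
+    functor_list = "elementwise_add,scale"  =>  Out = X + scale * Y
+    functor_list = "scale,elementwise_add"  =>  Out = scale * (X + Y)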
+ +)DOC"); + } + + private: + bool ValidCheck(const std::vector &functors) { + std::unordered_set unary_fun = {"scale", "relu"}; + std::unordered_set binary_fun = {"elementwise_add"}; + + std::string unary_fun_str; + if (binary_fun.count(functors[0])) { + unary_fun_str = functors[1]; + } else if (binary_fun.count(functors[1])) { + unary_fun_str = functors[0]; + } else { + PADDLE_THROW("%s and %s are not included in fused_list.", functors[0], + functors[1]); + } + PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, + "%s is not included in fused_list.", unary_fun_str); + return true; + } +}; + +class FusedElemwiseActivationGradMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType(this->ForwardOpType() + "_grad"); + + for (auto &input_param : this->InputNames()) { + op_desc_ptr->SetInput(input_param, this->Input(input_param)); + op_desc_ptr->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, true)); + } + + for (auto &output_param : this->OutputNames()) { + op_desc_ptr->SetInput(output_param, this->Output(output_param)); + op_desc_ptr->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + op_desc_ptr->SetAttrMap(this->Attrs()); + + std::vector functor_names = + boost::get>( + op_desc_ptr->GetAttr("functor_list")); + functor_names[0] += "_grad"; + functor_names[1] += "_grad"; + op_desc_ptr->SetAttr("functor_list", functor_names); + return std::unique_ptr(op_desc_ptr); + } +}; + +class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type_index = ctx.Input("X")->type(); + PADDLE_ENFORCE_EQ(input_data_type_index, + ctx.Input("Y")->type(), + "The element's type of input should be the same."); + PADDLE_ENFORCE_EQ( + input_data_type_index, + ctx.Input(framework::GradVarName("Out"))->type(), + "The element's type of input should be the same."); + + auto input_data_type = framework::ToDataType(input_data_type_index); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp, + ops::FusedElemwiseActivationMaker, + ops::FusedElemwiseActivationGradMaker); +REGISTER_OPERATOR(fused_elemwise_activation_grad, + 
+                  ops::FusedElemwiseActivationOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       double>);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1d2b16b4b5e3a480777f834c2cbeb6d00a755e4
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0017b824532b1210d0ae3e51983d63d081f12a
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
@@ -0,0 +1,425 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/functors.h"
+
+namespace math = paddle::operators::math;
+
+namespace paddle {
+namespace operators {
+
+// CompoundFunctors
+// For example: Z = Binary(X, Unary(Y))
+template <typename T, typename BinaryFun, typename UnaryFun>
+struct BinaryCompoundFunctor {
+  BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
+      : binary_fun_(binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return binary_fun_(x, unary_fun_(y));
+  }
+
+ private:
+  BinaryFun binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+// For example: Z = Unary(Binary(X, Y))
+template <typename T, typename UnaryFun, typename BinaryFun>
+struct UnaryCompoundFunctor {
+  UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
+      : unary_fun_(unary_fun), binary_fun_(binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return unary_fun_(binary_fun_(x, y));
+  }
+
+ private:
+  UnaryFun unary_fun_;
+  BinaryFun binary_fun_;
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have two methods to get
+// the dx: one uses the 'out', and the other does not.
+// The former saves the time of recomputing the 'out',
+// but it must occupy the memory to store the 'out',
+// while the latter avoids occupying this memory
+// but must recompute the 'out'.
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          bool Recomputation = true>
+struct BinaryCompoundGradDxFunctor {
+  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun)
+      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(x, unary_fun_(y));
+    } else {
+      return dout * d_binary_fun_(x, unary_fun_(y), out);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun, bool Recomputation = true>
+struct BinaryCompoundGradDyFunctor {
+  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun,
+                              const DUnaryFun &d_unary_fun)
+      : d_binary_fun_(d_binary_fun),
+        unary_fun_(unary_fun),
+        d_unary_fun_(d_unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
+    } else {
+      return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+  DUnaryFun d_unary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDxFunctor {
+  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDyFunctor {
+  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation)
{ + base = dout * d_unary_fun_(binary_fun_(x, y)); + } else { + base = dout * d_unary_fun_(binary_fun_(x, y), out); + } + return base * d_binary_fun_(y, x); + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; + DBinaryFun d_binary_fun_; +}; + +template +static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx, + const BinaryFunctor &binary_functor, + const UnaryFunctor &unary_functor, + const framework::Tensor *in_x, + const framework::Tensor *in_y, + framework::Tensor *output) { + int axis = ctx.Attr("axis"); + using BinaryCompoundFunctor = + BinaryCompoundFunctor; + + ElementwiseComputeEx( + ctx, in_x, in_y, axis, + BinaryCompoundFunctor(binary_functor, unary_functor), output); +} + +template +static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx, + const UnaryFunctor &unary_functor, + const BinaryFunctor &binary_functor, + const framework::Tensor *in_x, + const framework::Tensor *in_y, + framework::Tensor *output) { + int axis = ctx.Attr("axis"); + + using UnaryCompoundFunctor = + UnaryCompoundFunctor; + + ElementwiseComputeEx( + ctx, in_x, in_y, axis, + UnaryCompoundFunctor(unary_functor, binary_functor), output); +} + +template +static void RunBinaryCompoundGradFunctors( + const framework::ExecutionContext &ctx, + const BinaryGradFunctor &binary_grad_functor, + const UnaryFunctor &unary_functor, + const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x, + const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_out_grad, framework::Tensor *x_grad, + framework::Tensor *y_grad) { + int axis = ctx.Attr("axis"); + + using BinaryCompoundDxFunctor = + BinaryCompoundGradDxFunctor; + using BinaryCompoundDyFunctor = + BinaryCompoundGradDyFunctor; + + ElemwiseGradCompute( + ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad, + BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, + unary_grad_functor)); +} + +template +static void RunUnaryCompoundGradFunctors( + const framework::ExecutionContext &ctx, + const UnaryGradFunctor &unary_grad_functor, + const BinaryFunctor &binary_functor, + const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x, + const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_out_grad, framework::Tensor *x_grad, + framework::Tensor *y_grad) { + int axis = ctx.Attr("axis"); + + using UnaryCompoundDxFunctor = + UnaryCompoundGradDxFunctor; + using UnaryCompoundDyFunctor = + UnaryCompoundGradDyFunctor; + + ElemwiseGradCompute( + ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad, + UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), + UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, + binary_grad_functor)); +} + +template +static void RunFunctors(const framework::ExecutionContext &ctx, + const framework::Tensor *in_x, + const framework::Tensor *in_y, + framework::Tensor *output) { + auto &functors = ctx.Attr>("functor_list"); + auto funcs_str = functors[0] + "," + functors[1]; + // TODO(zcd): The following code can be refined. 
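+  // (Editorial sketch of the dispatch below, mirroring the forward
+  // definitions above.) funcs_str joins the two entries of functor_list, so
+  // "elementwise_add,scale" selects Out = X + scale * Y, while
+  // "scale,elementwise_add" selects Out = scale * (X + Y); the relu cases
+  // substitute relu(.) for the scale functor.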
+ if (funcs_str == "elementwise_add,scale") { + // Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundFunctor, + math::ScaleFunctor>( + ctx, math::AddFunctor(), math::ScaleFunctor(scale), in_x, in_y, + output); + } else if (funcs_str == "scale,elementwise_add") { + // Z = Unary(Binary(X, Y)) + T scale = static_cast(ctx.Attr("scale")); + RunUnaryCompoundFunctors, + math::AddFunctor>( + ctx, math::ScaleFunctor(scale), math::AddFunctor(), in_x, in_y, + output); + } else if (funcs_str == "elementwise_add,relu") { + RunBinaryCompoundFunctor, + math::ReluFunctor>( + ctx, math::AddFunctor(), math::ReluFunctor(), in_x, in_y, output); + } else if (funcs_str == "relu,elementwise_add") { + RunUnaryCompoundFunctors, + math::AddFunctor>( + ctx, math::ReluFunctor(), math::AddFunctor(), in_x, in_y, output); + } else { + PADDLE_THROW("%s has not been implemented.", funcs_str); + } +} + +template +static void RunGradFunctors(const framework::ExecutionContext &ctx, + const framework::Tensor *in_x, + const framework::Tensor *in_y, + const framework::Tensor *in_out, + const framework::Tensor *in_out_grad, + framework::Tensor *x_grad, + framework::Tensor *y_grad) { + auto &functors = ctx.Attr>("functor_list"); + auto funcs_str = functors[0] + "," + functors[1]; + + bool recomputation = ctx.Attr("recomputation"); + + // TODO(zcd): The following code can be refined. for example, use registion + if (funcs_str == "elementwise_add_grad,scale_grad") { + // The backward of Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + if (recomputation) { + RunBinaryCompoundGradFunctors, + math::ScaleFunctor, + math::ScaleGradFunctor, true>( + ctx, math::AddGradFunctor(), math::ScaleFunctor(scale), + math::ScaleGradFunctor(scale), in_x, in_y, in_out, in_out_grad, + x_grad, y_grad); + } else { + RunBinaryCompoundGradFunctors, + math::ScaleFunctor, + math::ScaleGradFunctor, false>( + ctx, math::AddGradFunctor(), math::ScaleFunctor(scale), + math::ScaleGradFunctor(scale), in_x, in_y, in_out, in_out_grad, + x_grad, y_grad); + } + } else if (funcs_str == "scale_grad,elementwise_add_grad") { + // The backward of Z = Unary(Binary(X, Y)) + T scale = static_cast(ctx.Attr("scale")); + if (recomputation) { + RunUnaryCompoundGradFunctors, + math::AddFunctor, math::AddGradFunctor, + true>(ctx, math::ScaleGradFunctor(scale), + math::AddFunctor(), + math::AddGradFunctor(), in_x, in_y, + in_out, in_out_grad, x_grad, y_grad); + } else { + RunUnaryCompoundGradFunctors, + math::AddFunctor, math::AddGradFunctor, + false>(ctx, math::ScaleGradFunctor(scale), + math::AddFunctor(), + math::AddGradFunctor(), in_x, in_y, + in_out, in_out_grad, x_grad, y_grad); + } + } else if (funcs_str == "elementwise_add_grad,relu_grad") { + if (recomputation) { + RunBinaryCompoundGradFunctors, + math::ReluFunctor, + math::ReluGradFunctor, true>( + ctx, math::AddGradFunctor(), math::ReluFunctor(), + math::ReluGradFunctor(), in_x, in_y, in_out, in_out_grad, x_grad, + y_grad); + } else { + RunBinaryCompoundGradFunctors, + math::ReluFunctor, + math::ReluGradFunctor, false>( + ctx, math::AddGradFunctor(), math::ReluFunctor(), + math::ReluGradFunctor(), in_x, in_y, in_out, in_out_grad, x_grad, + y_grad); + } + } else if (funcs_str == "relu_grad,elementwise_add_grad") { + if (recomputation) { + RunUnaryCompoundGradFunctors, + math::AddFunctor, math::AddGradFunctor, + true>(ctx, math::ReluGradFunctor(), + math::AddFunctor(), + math::AddGradFunctor(), in_x, in_y, + in_out, in_out_grad, x_grad, y_grad); + } else { + 
RunUnaryCompoundGradFunctors, + math::AddFunctor, math::AddGradFunctor, + false>(ctx, math::ReluGradFunctor(), + math::AddFunctor(), + math::AddGradFunctor(), in_x, in_y, + in_out, in_out_grad, x_grad, y_grad); + } + } else { + PADDLE_THROW("%s has not been implemented.", funcs_str); + } +} + +template +class FusedElemwiseActivationKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &in_x = detail::Ref(ctx.Input("X"), + "Cannot get input tensor %s, variable name = %s", + "X", ctx.op().Input("X")); + auto &in_y = detail::Ref(ctx.Input("Y"), + "Cannot get input tensor %s, variable name = %s", + "Y", ctx.op().Input("Y")); + auto &output = detail::Ref(ctx.Output("Out"), + "Cannot get input tensor %s, variable name = %s", + "Out", ctx.op().Output("Out")); + + RunFunctors(ctx, &in_x, &in_y, &output); + } +}; + +template +class FusedElemwiseActivationGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &in_x = detail::Ref(ctx.Input("X"), + "Cannot get input tensor %s, variable name = %s", + "X", ctx.op().Input("X")); + auto &in_y = detail::Ref(ctx.Input("Y"), + "Cannot get input tensor %s, variable name = %s", + "Y", ctx.op().Input("Y")); + auto &in_out = detail::Ref(ctx.Input("Out"), + "Cannot get input tensor %s, variable name = %s", + "Out", ctx.op().Input("Out")); + auto &in_out_grad = + detail::Ref(ctx.Input(framework::GradVarName("Out")), + "Cannot get input tensor %s, variable name = %s", + framework::GradVarName("Out"), + ctx.op().Input(framework::GradVarName("Out"))); + + framework::Tensor *x_grad = + ctx.Output(framework::GradVarName("X")); + framework::Tensor *y_grad = + ctx.Output(framework::GradVarName("Y")); + + RunGradFunctors(ctx, &in_x, &in_y, &in_out, &in_out_grad, + x_grad, y_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 438b44b42aaf4c7e3ff05a5f7c52bbfd850e92c7..b1948076969e7a651179b97b107b85beea175280 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,11 +19,12 @@ limitations under the License. */ #include // NOLINT #include +#include "gflags/gflags.h" + #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -103,6 +104,7 @@ void ListenAndServOp::RunSyncLoop( framework::Scope *recv_scope, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { + VLOG(2) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -122,7 +124,9 @@ void ListenAndServOp::RunSyncLoop( std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); + while (true) { + rpc_service_->Profiler().OneStep(); // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. 
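    // (Editorial sketch; the naming scheme is for illustration only.) E.g. a
    // gradient "w@GRAD" pushed by three trainers arrives under per-trainer
    // suffixed names such as "w@GRAD.trainer_0" ... "w@GRAD.trainer_2" and
    // is merged before the optimize blocks run.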
rpc_service_->SetCond(distributed::kRequestSend); @@ -170,6 +174,7 @@ void ListenAndServOp::RunSyncLoop( void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { + VLOG(2) << "RunAsyncLoop"; // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..27e26cb1b5c1e831f05dac299489628b92eaa58c 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); - // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. auto filename = Attr("file_path"); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bda499432214b8841c8dfc406ee45ca0367920e7..d77b095c5d783a2a9fab87eb8b458117a6a3d225 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -32,20 +32,21 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); - auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W - // and it must be a column vector with rank = 2 while the 2nd dimension - // size must be 1, when Ids's type is SelectedRows, the rows of Ids - // contains the ids to be looked up in W; - if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); - } + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); - ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); - ctx->ShareLoD("Ids", /*->*/ "Out"); + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } } protected: @@ -62,17 +63,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput( - "Ids", - "(Tensor or SelectedRows) Ids's type can be Tensor or " - "SelectedRows, when Ids's type is Tensor, this tensor contains " - "the ids to be looked up in W and it must be a column vector with " - "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " - "SelectedRows, the rows of Ids contains the ids to be looked up " - "in W."); - AddOutput("Out", - "(Tensor or SelectedRows) The lookup results, which have the " - "same type as W."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. 
" + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") @@ -90,15 +85,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense or sparse tensor. - -The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's -type is SelectedRows, the rows of Ids contains the ids to be looked up in W; -when Ids's type is Tensor, this tensor contains the ids to be looked up in W -and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, -at this time, Ids can carry the LoD (Level of Details) information, or not, and -the output only shares the LoD information with input Ids. +then concatenated into a dense tensor. +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. )DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 77722c50d39003d9342afb04a61ae3aaf6b21100..74823dab09cac358f647c074ac2f2ee2fed17e55 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -23,7 +23,7 @@ namespace operators { template -__global__ void LookupTable(T* output, const T* table, const int64_t* ids, +__global__ void LookupTable(T *output, const T *table, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; @@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - T* out = output + idy * D; - const T* tab = table + id * D; + T *out = output + idy * D; + const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { if (PaddingFlag) { if (id == padding_idx) @@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, +__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; @@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, int id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - const T* out = output + idy * D; - T* tab = table + id * D; + const T *out = output + idy * D; + T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } @@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, template class LookupTableCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); - - int64_t* ids; - int64_t K; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when 
Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. - if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - K = ids_t->numel(); - } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); - K = ids_t->rows().size(); - output_t->Resize({K, table_t->dims()[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); dim3 threads(128, 8); dim3 grids(8, 1); @@ -122,41 +105,44 @@ class LookupTableCUDAKernel : public framework::OpKernel { template class LookupTableGradCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); - auto ids_dim = ids->dims(); + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; - new_rows.resize(ids_dim[0]); + new_rows.resize(ids_num); auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. 
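        // (Editorial note.) The id values live in GPU memory, while the
        // SelectedRows rows are kept in a framework::Vector<int64_t>; the
        // copy below stages the ids into that vector's buffer so that
        // set_rows() can record which table rows this mini-batch touched.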
memory::Copy(platform::CPUPlace(), new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_dim[0] * sizeof(int64_t), stream); + ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); - auto* d_table_data = d_table_value->data(); - auto* d_output_data = d_output->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); @@ -168,9 +154,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d482506bf0361c11a019e32efbf348a64aaf5164..f5c10ced8305b64c6386c5051804f8c9a8f71802 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -36,43 +36,13 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto *ids_var = context.InputVar("Ids"); - Tensor *output_t = context.Output("Out"); - int64_t padding_idx = context.Attr("padding_idx"); - - DDim table_dim; - if (table_var->IsType()) { - table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); - table_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); - } - - int64_t *ids; - int64_t ids_numel; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
- if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - ids_numel = ids_t->numel(); - } else if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); - ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_dim[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); if (table_var->IsType()) { auto *table_t = context.Input("W"); @@ -139,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); + int64_t ids_num = ids->numel(); framework::Vector new_rows; - new_rows.reserve(ids_dim[0]); - for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.reserve(ids_num); + for (int64_t i = 0; i < ids_num; i++) { new_rows.push_back(ids_data[i]); } d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table_dim[1]}); + d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); d_table->set_height(table_dim[0]); @@ -157,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_output_data = d_output->data(); auto *d_table_data = d_table_value->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { auto *ids = context.Input("Ids"); @@ -165,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); int N = table_dim[0]; - int D = d_output->dims()[1]; + int D = table_dim[1]; auto *d_output_data = d_output->data(); auto *d_table_data = d_table->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h new file mode 100644 index 0000000000000000000000000000000000000000..ad2f49ccbf5ff37d33cc9e71c1a683571f4f8137 --- /dev/null +++ b/paddle/fluid/operators/math/functors.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// AddFunctor
+template <typename T>
+struct AddFunctor {
+  // out = x + y;
+  inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
+};
+
+template <typename T>
+struct AddGradFunctor {
+  inline HOSTDEVICE T operator()(T x, T y) { return 1; }
+
+  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
+};
+
+template <typename T>
+struct ScaleFunctor {
+  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ScaleGradFunctor {
+  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T x) { return coeff_; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ReluFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+};
+
+template <typename T>
+struct ReluGradFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index a50b9ace39249f4f899a46e171bbdced033b46bc..1472edbbf47e3e4d6b22c65349713904b13647d2 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 namespace paddle {
 namespace operators {
@@ -35,36 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col->dims().size() == 5);
 
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
-
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * col_height + h) * col_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
+    if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
+        dilation[1] == 1) {
+      if (padding[0] == 0 && padding[1] == 0) {
+        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
       }
+      // TODO(TJ): complete padding >=2
     }
+    im2col_common<T>(im, dilation, stride, padding, col);
   }
 };
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6
--- /dev/null
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -0,0 +1,252 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 8e3f0f286823c383bb0c44d0e7887040ec9b20a0..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -167,3 +169,104 @@ TEST(math, im2col) { paddle::platform::CUDAPlace>(); #endif } + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); + } +} + +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 100; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; + }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); + + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; +} + +TEST(math, im2col_cputest) { + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); + + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { + LOG(INFO) << "padding == " << p << ", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } +} diff --git 
a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index a579182ec1bd5d10d95bbf8c6f5a0e70ceaaaf4b..3effe776258cb541dbba32f63eda457d917011f4 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index c9744db3d0654ef63357963d9a9a3cb946f56e2d..eb09470f37eabb5524f774bc289fc68f5884c540 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -164,14 +163,11 @@ class ParallelDoOp : public framework::OperatorBase { auto &place = places[place_idx]; auto *cur_scope = sub_scopes[place_idx]; - workers.emplace_back( - framework::Async([program, cur_scope, place, block, place_idx] { - // Give the thread an id to distinguish parallel block with same id. - platform::RecordThread rt(static_cast(place_idx) + 1); - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); @@ -242,14 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase { auto *cur_scope = sub_scopes[i]; // execute - workers.emplace_back( - framework::Async([program, cur_scope, place, block, i] { - // Give the thread an id to distinguish parallel block with same id. 
- platform::RecordThread rt(static_cast(i) + 1); - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index be55bc43b14f1e6211f71b4080d1676838ad508c..31f083565fddee66aea1485ed71f41b6199f4502 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, cudnn_output_desc, output_data)); } @@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 65fcce8bb019965a805ad09d50be0aba64e4f24e..a0d640b2020958af53a4405ae886eadb2a1e117e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase { .GetMutable(); std::vector out_arg_names = Outputs("Out"); std::vector ins; + + // For profiling + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(dev_place); + platform::RecordEvent record_event(Type(), &ctx); + reader->ReadNext(&ins); if (ins.empty()) { if (Attr("throw_eof_exp")) { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ba684014904e61a86bebacd7d29d7e10d313092..4a6ce938a5f337d035b21f562d46daf606236db0 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a9fd1869c9df5464db6fc87ac633cdba2d6dbe7f..a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", 
"(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); - AddAttr("inplace", - "(default: false) Change the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Reshape Operator. @@ -233,16 +227,9 @@ class ReshapeKernel { "sequence_reshape op."); } - bool inplace = ctx.Attr("inplace"); + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } } }; @@ -251,19 +238,11 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index d7f8e994afd7e656bd5a9dd7c5ab45f0d52fe88b..1866a86048acbefadcb4d82cd6309cd16f0352d6 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase { std::vector eps = Attr>("endpoints"); bool sync_mode = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 829f310d4233c01a7fbb9ccf7427f6e47ce8d384..3cd42f2d059532b7090e66ce21de8e5cb014adf1 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index deab005149027caffa962783df944fad7110382f..dc26c53c64f06ce21856fb5af8f2a5eb3fc75bb7 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace operators { @@ -22,7 +23,10 @@ inline bool NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. 
- if (varname == "dummy") return false; + // TODO(paddle-dev): Why would parallel executor logic leak into here? + if (varname.find(framework::ir::Node::kControlDepVarName) != + std::string::npos) + return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 5596fa0648ccc151bc0d11de9c556599428a8d71..2bdb23e999621b10799b5163f326bc4b66a437e6 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + math::SoftmaxCUDNNFunctor()( - context.template device_context(), X, Out); + context.template device_context(), + &flattened_x, &flattened_out); } }; @@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), Out, - dOut, dX); + context.template device_context(), + &flattened_out, &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 6668e6b9e917eea7ba4a80ac78917b73eb827208..01819f53e3ab0973f6140c5a81f18f954b6a0376 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; -using mkldnn::softmax_forward; -using mkldnn::softmax_backward; using mkldnn::prop_kind; +using mkldnn::softmax_backward; +using mkldnn::softmax_forward; using mkldnn::stream; using platform::to_void_cast; @@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - PADDLE_ENFORCE(input->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - const T* input_data = input->data(); - // allocate memory for output - T* output_data = output->mutable_data(ctx.GetPlace()); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg.
axis: 1 - PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), - "Softmax input and output dimensions should match"); + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + "The shape of softmax's input and output must be identical."); + + // make sure 'output' holds memory, which will be shared by + // 'flattened_output' later. + output->mutable_data(ctx.GetPlace()); + + // flatten input and output to 2-D matrices + auto dims = input->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_input; + framework::Tensor flattened_output; + flattened_input.ShareDataWith(*input).Resize(flattened_dims); + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + + const T* input_data = flattened_input.data(); + T* output_data = flattened_output.mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); + std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retrieving primitives for this operator @@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); - const T* dst_data = output->data(); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - const auto* diff_dst_ptr = dout->template data(); - auto* dx = ctx.template Output(framework::GradVarName("X")); - T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + PADDLE_ENFORCE_EQ( + dout->dims(), dx->dims(), + "The shape of softmax_grad's input and output must be identical."); + + // make sure 'dx' holds memory, which will be shared by 'flattened_dx' + // later. + dx->template mutable_data(ctx.GetPlace()); + + auto dims = dout->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_output; + framework::Tensor flattened_dout; + framework::Tensor flattened_dx; + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); + flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); + + const T* dst_data = flattened_output.data(); + const T* diff_dst_ptr = flattened_dout.template data(); + T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); std::vector src_tz(dst_tz); - PADDLE_ENFORCE(output->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg.
axis: 1 - PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), - "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Currently only supports NC data format diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 31a7458f637921c290fc71ac748143867b4aae19..bb081238820b9ee3ae095442d21cfce11f7b41e5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SoftmaxOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE(x_dims.size() == 2UL, - "The input of softmax op must be a matrix."); - ctx->SetOutputDim("Out", x_dims); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of softmax. " - "2-D with shape [batch_size, input_feature_dimensions]."); + "The input tensor of softmax, " + "whose last dimension is the input_feature_dimensions."); AddOutput("Out", "The normalized values with the same shape as X.") .Reuse("X"); AddAttr( @@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Softmax Operator. -The input of the softmax operator is a 2-D tensor with shape N x K (N is the -batch_size, K is the dimension of input feature). The output tensor has the -same shape as the input tensor. +The input of the softmax operator is a tensor of any rank. The output tensor +has the same shape as the input. -For each row of the input tensor, the softmax operator squashes the -K-dimensional vector of arbitrary real values to a K-dimensional vector of real -values in the range [0, 1] that add up to 1. +The input tensor will first be logically flattened to a 2-D matrix. The matrix's +second dimension (row length) is the same as the last dimension of the input +tensor, and the first dimension (column length) is the product of all the other +dimensions of the input tensor. For each row of the matrix, the softmax operator +squashes the K-dimensional (K is the width of the matrix, which is also the size +of the input tensor's last dimension) vector of arbitrary real values to a +K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. Then the ratio of the exponential of the given dimension and the sum of exponential values of all the other dimensions is the output of the softmax operator.
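+
+For example (an illustrative shape, not part of the operator definition): an
+input X of shape [2, 3, 4] is viewed as a 6 x 4 matrix, the softmax is computed
+independently over each of the 6 rows of length K = 4, and the result keeps the
+shape [2, 3, 4], so every slice Out[i, j, :] sums to 1.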
-For each row $i$ and each column $j$ in Input(X), we have: +For each row $i$ and each column $j$ in the matrix, we have: $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])}$$ )DOC"); } }; @@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx->GetInputDim(framework::GradVarName("Out")), "Input(Out) and its gradients should have a same shape."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: @@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { } }; +class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("softmax_grad"); + + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SoftmaxOpGradMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 600da45a0bbb69b76d59c981e195fc03a49b0504..1205bd0587f32caae04c27ecea581fc17988507f 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + math::SoftmaxFunctor()( - context.template device_context(), X, Out); + context.template device_context(), &flattened_x, + &flattened_out); } }; @@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device.
dX->mutable_data(context.GetPlace()); + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + math::SoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX); + context.template device_context(), &flattened_out, + &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8f7840cee1dd95a828fd4ac8815e335a5db47e3d..a559b01ed32a48e3befb37c2ae8935b4f3a4acb0 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ limitations under the License. */ #define EIGEN_USE_GPU +#include +#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" namespace paddle { @@ -53,8 +55,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad, logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]); } } + } // namespace +static __device__ __forceinline__ float real_exp(float x) { return expf(x); } +static __device__ __forceinline__ double real_exp(double x) { return exp(x); } +static __device__ __forceinline__ float real_log(float x) { + return math::TolerableValue()(logf(x)); +} +static __device__ __forceinline__ double real_log(double x) { + return math::TolerableValue()(log(x)); +} + +/** In the following code, 3 CUDA kernels are implemented to calculate softmax + * and loss **/ +/* + Supposing x is `logits` and y is `labels`, the equations are as +follows: + + cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})] + = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})] + = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})] + = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)] + = \sum_{j}(-y_i_j * tmp_i_j) + + softmax_i_j = e^{tmp_i_j} + +where: + max_i = \max_{j}{x_i_j} + logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i} + tmp_i_j = x_i_j - max_i - logDiffMaxSum_i + +Therefore, the calculation can be separated into 3 steps: +Step 1: row-wise operation to calculate max_i +Step 2: row-wise operation to calculate logDiffMaxSum_i +Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i + +To save memory, we can share memory among max_i, logDiffMaxSum_i and +cross\_entropy_i.
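+
+(A small numeric illustration, with made-up values: for one row x_i = [2, 4],
+max_i = 4 and logDiffMaxSum_i = log(e^{-2} + e^{0}) ~ 0.127, so
+tmp_i = [-2.127, -0.127] and softmax_i = e^{tmp_i} ~ [0.119, 0.881], which sums
+to 1; with y_i = [0, 1], cross\_entropy_i ~ 0.127.)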
+In this way, the 3 steps should be changed to: +Step 1 (RowReductionForMax): row-wise operation to calculate max_i +Step 2 (RowReductionForDiffMaxSum): calculate the intermediate result of softmax'_i_j = +x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i +Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j +- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i +*/ + +// There are 3 kinds of reduce algorithms in cub: +// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY +// BLOCK_REDUCE_RAKING +// BLOCK_REDUCE_WARP_REDUCTIONS (default) +template +using BlockReduce = + cub::BlockReduce; + +template +using BlockReduceTempStorage = typename BlockReduce::TempStorage; + +// Make sure that BlockDim <= feature_size +// This kernel is used to calculate the max element of each row +template +__global__ void RowReductionForMax(const T* logits_data, T* max_data, + int feature_size) { + __shared__ BlockReduceTempStorage temp_storage; + + auto beg_idx = feature_size * blockIdx.x + threadIdx.x; + auto end_idx = feature_size * (blockIdx.x + 1); + + T cur_max = logits_data[beg_idx]; + beg_idx += BlockDim; + while (beg_idx < end_idx) { + if (cur_max < logits_data[beg_idx]) { + cur_max = logits_data[beg_idx]; + } + beg_idx += BlockDim; + } + + cur_max = BlockReduce(temp_storage).Reduce(cur_max, cub::Max()); + + if (threadIdx.x == 0) { + max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max; + } +} + +// Make sure that BlockDim <= feature_size +template +__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data, + T* softmax, int feature_size) { + __shared__ BlockReduceTempStorage temp_storage; + + auto beg_idx = feature_size * blockIdx.x + threadIdx.x; + auto end_idx = feature_size * (blockIdx.x + 1); + + auto block_max = max_data[blockIdx.x]; + + softmax[beg_idx] = logits_data[beg_idx] - block_max; + T diff_max_sum = real_exp(softmax[beg_idx]); + beg_idx += BlockDim; + while (beg_idx < end_idx) { + softmax[beg_idx] = logits_data[beg_idx] - block_max; + diff_max_sum += real_exp(softmax[beg_idx]); + beg_idx += BlockDim; + } + + diff_max_sum = + BlockReduce(temp_storage).Reduce(diff_max_sum, cub::Sum()); + if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum); +} + +// Make sure that BlockDim <= feature_size +template +__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data, + const T* labels_data, + T* loss_data, T* softmax, + int feature_size) { + __shared__ BlockReduceTempStorage temp_storage; + + auto beg_idx = feature_size * blockIdx.x + threadIdx.x; + auto end_idx = feature_size * (blockIdx.x + 1); + + // log_diff_max_sum shares memory with loss + auto block_log_diff_max_sum = loss_data[blockIdx.x]; + auto tmp = softmax[beg_idx] - block_log_diff_max_sum; + softmax[beg_idx] = real_exp(tmp); + auto loss = -labels_data[beg_idx] * tmp; + beg_idx += BlockDim; + while (beg_idx < end_idx) { + tmp = softmax[beg_idx] - block_log_diff_max_sum; + softmax[beg_idx] = real_exp(tmp); + loss -= (labels_data[beg_idx] * tmp); + beg_idx += BlockDim; + } + + loss = BlockReduce(temp_storage).Reduce(loss, cub::Sum()); + if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; +} + +template +__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) { + auto idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < batch_size) out[idx] = static_cast(1); +} + +template +static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, + const T* labels_data, + T* softmax_data, T* loss_data, + int batch_size, int
feature_size, + cudaStream_t stream) { + constexpr int kMaxBlockDim = 512; + int block_dim = feature_size >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(feature_size))); + +#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: \ + RowReductionForMax<<>>( \ + logits_data, loss_data, feature_size); \ + RowReductionForDiffMaxSum<<>>( \ + logits_data, loss_data, softmax_data, feature_size); \ + RowReductionForSoftmaxAndCrossEntropy< \ + T, BlockDim><<>>( \ + logits_data, labels_data, loss_data, softmax_data, feature_size); \ + break + + switch (block_dim) { + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); + case 1: + SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) / + kMaxBlockDim, + kMaxBlockDim, 0, stream>>>( + softmax_data, batch_size); + cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream); + break; + default: + PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + break; + } + +#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + template class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { public: @@ -66,14 +256,24 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* softmax = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); - softmax->mutable_data(context.GetPlace()); - loss->mutable_data(context.GetPlace()); - - math::SoftmaxFunctor()( - context.cuda_device_context(), logits, softmax); - math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto* softmax_data = softmax->mutable_data(context.GetPlace()); + auto* loss_data = loss->mutable_data(context.GetPlace()); + + auto soft_label = context.Attr("soft_label"); + if (soft_label) { + int batch_size = logits->dims()[0]; + int feature_size = logits->dims()[1]; + auto* logits_data = logits->data(); + auto* labels_data = labels->data(); + SoftmaxWithCrossEntropyFusedKernel( + logits_data, labels_data, softmax_data, loss_data, batch_size, + feature_size, context.cuda_device_context().stream()); + } else { + math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, + softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, false); + } } }; diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d263426e073d95ad6d584c7370baf596587a993d..c4af5a65fc5f81c1af7c1fdcca637ca37c940637 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,7 @@ limitations under the License.
*/ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel { const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); + for (auto &out : outs) { + out->mutable_rows()->clear(); + } // get rows for outputs - for (auto &id : ids_rows) { - size_t shard_id = static_cast(id) % shard_num; - outs[shard_id]->mutable_rows()->push_back(id); + std::unordered_map id_to_index; + for (size_t i = 0; i < ids_rows.size(); ++i) { + id_to_index[ids_rows[i]] = i; + size_t shard_id = static_cast(ids_rows[i]) % shard_num; + outs[shard_id]->mutable_rows()->push_back(ids_rows[i]); } int64_t row_width = ids_dims[1]; @@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel { {static_cast(out->rows().size()), row_width}); T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { - memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + memcpy(output + i * row_width, + ids + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index db641a4bc2c637e0babee6b6bc6e67b068759ff5..ee3078876c15b06a887064f08dc0c05d450b5f77 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - - switch (shape.size()) { - case 2: - return nvinfer1::Dims2(1, shape[1]); - case 3: - return nvinfer1::Dims3(1, shape[1], shape[2]); - case 4: - return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]); - default: - return nvinfer1::Dims(); - } - return nvinfer1::Dims(); + PADDLE_ENFORCE_EQ(shape.size(), 4UL); + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); } } // namespace @@ -86,6 +76,9 @@ void TensorRTEngineKernel::Prepare( parameters.insert(param); } + std::vector output_maps = + context.Attr>("output_name_mapping"); + // TODO(Superjomn) replace this with a different stream auto *engine = Singleton::Global().Create( max_batch, max_workspace, nullptr /*engine hold its own stream*/, @@ -97,6 +90,7 @@ void TensorRTEngineKernel::Prepare( // Add inputs VLOG(4) << "declare inputs"; for (auto &input : context.Inputs("Xs")) { + if (parameters.count(input)) continue; VLOG(4) << "declare input " << input; auto *var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description @@ -122,7 +116,7 @@ void TensorRTEngineKernel::Prepare( block_desc, parameters, context.scope(), engine); // Add outputs - for (auto &output : context.Outputs("Ys")) { + for (auto &output : output_maps) { engine->DeclareOutput(output); } @@ -163,7 +157,4 @@ REGISTER_OP_CPU_KERNEL( ops::TensorRTEngineKernel, ops::TensorRTEngineKernel); -// A trick to compile with the needed TensorRT op converter. 
-USE_TRT_CONVERTER(mul) - #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 32d10fd8a5687ebaae1d7d75af531cbc45ef4245..2cbe1213a2f428a3ce56b06f97636baeb4b66c26 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, context.Attr("max_batch")); + std::vector output_maps = + context.Attr>("output_name_mapping"); + + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { + if (parameters.count(x)) continue; // convert input and copy to TRT engine's buffer auto& t = inference::analysis::GetFromScope( context.scope(), x); @@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel { // Execute the engine. PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); engine->Execute(FLAGS_tensorrt_engine_batch_size); + // Convert output tensor from engine to fluid + int output_index = 0; for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(y); + nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. std::vector ddim(dims.d, dims.d + dims.nbDims); @@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) change this float to dtype size. auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * FLAGS_tensorrt_engine_batch_size; - engine->GetOutputInCPU(y, + engine->GetOutputInCPU(output_maps[output_index], fluid_t->mutable_data(platform::CPUPlace()), size * sizeof(float)); //} else { @@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // y, fluid_t->mutable_data(platform::CUDAPlace()), // size * sizeof(float)); //} + output_index += 1; } cudaStreamSynchronize(*engine->stream()); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 7cb1e47a1516c32fb31a7818e7203b498e31e431..37657fa0b0498986fe67027415279af1775e58b9 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) { SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", + std::vector({"z0"})); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); @@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { std::vector({"y0", "y1", "y2", "y3"})); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); + SetAttr>(engine_op_desc.Proto(), + "output_name_mapping", + std::vector({"z3"})); + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); // Execute them. 
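A usage sketch of the new "output_name_mapping" attribute (illustrative only, not part of the patch): the i-th Fluid output variable in "Ys" is paired positionally with the i-th TensorRT tensor name, so the engine is always queried by the mapped name rather than by the Fluid variable name.

// Illustrative pairing loop, assuming an `engine` and `context` as above.
auto ys = context.Outputs("Ys");
auto trt_names =
    context.Attr<std::vector<std::string>>("output_name_mapping");
PADDLE_ENFORCE_EQ(ys.size(), trt_names.size());
for (size_t i = 0; i < ys.size(); ++i) {
  // Fluid variable ys[i] is reshaped and filled from the engine tensor
  // registered under trt_names[i], never from ys[i] itself.
  nvinfer1::ITensor* trt_t = engine->GetITensor(trt_names[i]);
  VLOG(4) << ys[i] << " <- " << trt_names[i]
          << ", dims: " << trt_t->getDimensions().nbDims;
}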
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e0d7937ae2f3ce4bda12f3771727e2992d63cb9b..f08c0e8e345179bf198ca9d50278b7f65e03ca2c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -18,7 +18,11 @@ else() endif() cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) -cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce) +set(CPU_INFO_DEPS gflags glog enforce) +IF(WITH_XBYAK) + list(APPEND CPU_INFO_DEPS xbyak) +ENDIF() +cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) @@ -60,3 +64,7 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) + +IF(WITH_GPU) + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) +ENDIF() diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 77ecb170111d63f23312d06fa8a8172bc45f2a4e..234a04b5c2eb5ee643e8a4e723b28331cd8e6ee0 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML +#include #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index f832d72b53e8d06a32d5c0ac2ecf7130aa28a666..7d53a684d6068c79659719159696ef5aebfeaa2b 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -14,6 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_info.h" +#ifdef PADDLE_WITH_XBYAK +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" +#endif + #ifdef __APPLE__ #include #include @@ -98,5 +103,39 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } +#ifdef PADDLE_WITH_XBYAK +namespace jit { + +static Xbyak::util::Cpu cpu; +bool MayIUse(const cpu_isa_t cpu_isa) { + using namespace Xbyak::util; // NOLINT + switch (cpu_isa) { + case sse42: + return cpu.has(Cpu::tSSE42); + case avx2: + return cpu.has(Cpu::tAVX2); + case avx512_common: + return cpu.has(Cpu::tAVX512F); + case avx512_core: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); + case avx512_core_vnni: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && + cpu.has(Cpu::tAVX512_VNNI); + case avx512_mic: + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && + cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); + case avx512_mic_4ops: + return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && + cpu.has(Cpu::tAVX512_4VNNIW); + case isa_any: + return true; + } + return false; +} + +} // namespace jit +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index f06c2b67fe4385f427322e9bb2f3080fdd3acc94..f5f67667594f1ab80058533e4c5d5b04c2592b60 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -37,5 +37,25 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); +#ifdef PADDLE_WITH_XBYAK +namespace jit { + +typedef enum { + isa_any, + sse42, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, +} cpu_isa_t; // Instruction set architecture + +// May I use some instruction +inline bool MayIUse(const cpu_isa_t cpu_isa); + +} // namespace jit +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index ecec4178f2d9937920e52eb74bf9068b84e741a0..23457ff5fe1ec27094113ba0dde26adc64c716b5 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -14,6 +14,10 @@ limitations under the License. */ #pragma once #include +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { @@ -36,6 +40,18 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, #endif } +// CUDA 9.0 has a natively compatible float16 shfl_down +#if CUDA_VERSION < 9000 +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + half tmp = static_cast(val); + tmp = __shfl_down(tmp, static_cast(delta), width); + return float16(tmp); +} +#endif + template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { @@ -46,6 +62,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, #endif } +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + template __device__ T reduceSum(T val, int tid, int len) { // NOTE(zcd): The warp size should be taken from the diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68 --- /dev/null +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -0,0 +1,153 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; + +template +__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); + } +} + +template +struct AddFunctor { + T operator()(const T& a, const T& b) { return a + b; } +}; + +template +void TestCase(size_t num) { + T *in1, *in2, *out; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result.
+ EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(in1[i], in2[i])), 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +// cuda primitives +TEST(CudaAtomic, Add) { + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +TEST(CudaAtomic, float16) { + TestCase(static_cast(1)); + TestCase(static_cast(2)); + TestCase(static_cast(3)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +// unalignment of uint8 +void TestUnalign(size_t num, const int shift_bit) { + PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + float16 *in1, *in2, *out; + float16 *d_in1, *d_in2; + size_t size = sizeof(uint8_t) * (num + shift_bit); + size_t array_size = sizeof(float16) * (num / 2); + + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + + // right shift 1, mimic the unalignment of address + float16* r_in1 = + reinterpret_cast(reinterpret_cast(in1) + shift_bit); + float16* r_in2 = + reinterpret_cast(reinterpret_cast(in2) + shift_bit); + + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num / 2; ++i) { + r_in1[i] = static_cast(dist(engine)); + r_in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num / 2; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(r_in1[i], r_in2[i])), + 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaAtomic, float16Unalign) { + // same with float16 testcase + TestUnalign(static_cast(2), /*shift_bit*/ 2); + TestUnalign(static_cast(1024), /*shift_bit*/ 2); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 2); + + // shift the address. + TestUnalign(static_cast(2), /*shift_bit*/ 1); + TestUnalign(static_cast(1024), /*shift_bit*/ 1); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 1); + + TestUnalign(static_cast(2), /*shift_bit*/ 3); + TestUnalign(static_cast(1024), /*shift_bit*/ 3); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); +} diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index d535ed2f89df6a0b311ec068ecd92c8e3183cee7..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -14,12 +14,14 @@ limitations under the License. 
*/ #pragma once #include +#include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { #define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T* address, const T val) + __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) #define USE_CUDA_ATOMIC(op, T) \ CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } @@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT "long long should be int64"); return CudaAtomicAdd( - reinterpret_cast(address), // NOLINT - static_cast(val)); // NOLINT + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int* address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT + unsigned long long int *address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; @@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) { return __longlong_as_double(old); } +#endif + +#ifdef PADDLE_CUDA_FP16 +// NOTE(dzhwinter): CUDA does not have atomicCAS for half. +// Just use the half address as an unsigned value address and +// do the atomicCAS. Depending on whether the value is stored in the +// high 16 bits or the low 16 bits, do a different sum and CAS. +// Since most warp threads will fail the atomicCAS, this +// implementation should be avoided under high concurrency. It will be +// slower than converting the value into 32 bits and doing a full atomicCAS. + +// Convert the value into float and do the add arithmetic, +// then store the result into a uint32. +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in the lower 16 bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in the higher 16 bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + // The packed float16 value may exist in the lower or higher 16 bits + // of the 32-bit-aligned address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stays in the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stays in the higher 16 bits of the address.
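+ // (Illustration, assuming a little-endian layout: for a half stored at
+ // an address with bit 1 set, address_as_ui points at the aligned word
+ // two bytes below, the value occupies bits 16..31 of that word, and
+ // only that high half is replaced by the CAS loop below.)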
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} + #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2..bb8b14bb9fa41942c3aa653ca224c0842fbf9a00 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) -#define CUDNN_ENFORCE(condition) \ - do { \ - cudnnStatus_t status = condition; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ - PADDLE_THROW("cuDNN call failed"); \ - } \ +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ + PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ + } \ } while (false) enum class DataLayout { // Not use diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf8ec439d158e57c94d3e6e684bce116..dc1d751141187edb7738e42c41514614d4d399b0 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -30,9 +30,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace { -// Current thread's id. Note, we don't distinguish nested threads -// for now. -thread_local int cur_thread_id = 0; // Tracking the nested block stacks of each thread. thread_local std::deque block_id_stack; // Tracking the nested event stacks. @@ -192,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } } // namespace +#endif // PADDLE_WITH_CUPTI + class DeviceTracerImpl : public DeviceTracer { public: DeviceTracerImpl() : enabled_(false) {} @@ -247,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer { if (enabled_) { return; } + +#ifdef PADDLE_WITH_CUPTI EnableActivity(); // Register callbacks for buffer requests and completed by CUPTI. @@ -265,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer { dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = true; } @@ -316,16 +318,21 @@ class DeviceTracerImpl : public DeviceTracer { } void Disable() { +#ifdef PADDLE_WITH_CUPTI // flush might cause additional calls to DeviceTracker. 
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); +#endif // PADDLE_WITH_CUPTI std::lock_guard l(trace_mu_); +#ifdef PADDLE_WITH_CUPTI DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = false; } private: +#ifdef PADDLE_WITH_CUPTI static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { auto *cbInfo = reinterpret_cast(cbdata); @@ -343,7 +350,8 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; } } - + CUpti_SubscriberHandle subscriber_; +#endif // PADDLE_WITH_CUPTI std::mutex trace_mu_; bool enabled_; uint64_t start_ns_; @@ -352,45 +360,9 @@ class DeviceTracerImpl : public DeviceTracer { std::vector mem_records_; std::vector cpu_records_; std::unordered_map correlations_; - CUpti_SubscriberHandle subscriber_; -}; - -#endif // PADDLE_WITH_CUPTI - -class DeviceTracerDummy : public DeviceTracer { - public: - DeviceTracerDummy() {} - - void AddAnnotation(uint64_t id, const std::string &anno) {} - - void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) {} - - void AddMemRecords(const std::string &name, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t stream_id, - uint32_t correlation_id, uint64_t bytes) {} - - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) {} - - bool IsEnabled() { return false; } - - void Enable() {} - - proto::Profile GenProfile(const std::string &profile_path) { - return proto::Profile(); - } - - void Disable() {} }; -void CreateTracer(DeviceTracer **t) { -#ifdef PADDLE_WITH_CUPTI - *t = new DeviceTracerImpl(); -#else - *t = new DeviceTracerDummy(); -#endif // PADDLE_WITH_CUPTI -} +void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } DeviceTracer *GetDeviceTracer() { std::call_once(tracer_once_flag, CreateTracer, &tracer); @@ -413,12 +385,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void ClearCurBlock() { block_id_stack.pop_back(); } int BlockDepth() { return block_id_stack.size(); } - -void SetCurThread(int thread_id) { cur_thread_id = thread_id; } - -void ClearCurThread() { cur_thread_id = 0; } - -int CurThread() { return cur_thread_id; } - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..322996fb4f54d34ebbb034a6e1de420e9c532545 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" @@ -25,6 +28,12 @@ namespace platform { // WARN: Under Development. Don't depend on it yet. ////////////////////// +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} + // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 2. Collect cuda statistics: start/end ts, memory, etc. 
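A usage sketch for the PosixInNsec() helper added above (illustrative only; the value is derived from gettimeofday, so the effective resolution is microseconds even though the unit is nanoseconds):

// Bracket a host-side region and hand it to the tracer as a CPU record.
uint64_t start_ns = paddle::platform::PosixInNsec();
RunHostWork();  // hypothetical workload
uint64_t end_ns = paddle::platform::PosixInNsec();
paddle::platform::GetDeviceTracer()->AddCPURecords(
    paddle::platform::CurAnnotation(), start_ns, end_ns, /*device_id=*/0,
    /*thread_id=*/0);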
@@ -99,9 +108,5 @@ std::string CurAnnotation(); void SetCurBlock(int block_id); void ClearCurBlock(); int BlockDepth(); - -void SetCurThread(int thread_id); -void ClearCurThread(); -int CurThread(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index ffd183af68514dbb1a8b3de39000c9ca3f56ddc3..efb021c838e3680ab2cdd1c4b298cf7ec2186478 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -67,8 +67,11 @@ struct float16; } // namespace platform } // namespace paddle +// NOTE(): +// Do not move the eigen.h header, otherwise the eigen_vector will fail. #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { @@ -898,6 +901,30 @@ struct is_pod { is_standard_layout::value; }; +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::float16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::float16& a) { + return paddle::platform::isinf(a); +} + template <> struct numeric_limits { static const bool is_specialized = true; diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc..27e930e6e0a76982b3f27619f38a4a08d82cafa1 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) { } } +TEST(float16, floating) { + // compile time assert. + PADDLE_ASSERT(std::is_floating_point::value); +} + TEST(float16, print) { float16 a = float16(1.0f); std::cout << a << std::endl; } +// CPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + float16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc..e2b7ca9b03809113c31af8ff4d3ad3713748f330 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,11 +11,13 @@ limitations under the License.
*/ #include "paddle/fluid/platform/float16.h" +#include #include +#include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/legacy/utils/Logging.h" #define ARITHMETIC_KERNEL(op_type, sign) \ __global__ void op_type(const half* in1, const half* in2, half* out) { \ @@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) { } } +template +struct Functor { + bool operator()(const T& val) { + return std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16)); + } +}; + +TEST(float16, typeid) { + // the framework heavily used typeid hash + Functor functor; + float16 a = float16(.0f); + Functor functor2; + int b(0); + + // compile time assert + PADDLE_ASSERT(functor(a) == true); + PADDLE_ASSERT(functor2(b) == false); +} + +// GPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + // underflow to 0 + float16 native_a(5e-40f); + // overflow to inf + float16 native_b(5e40f); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(native_b), true); + EXPECT_EQ(native_a, float16(0)); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = float16(5e40); + // inf * +-0 will get a nan + float16 d = c * float16(0); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(d), true); +} + +TEST(float16, cast) { + float16 a; + a.x = 0x0070; + auto b = a; + { + // change semantic, keep the same value + float16 c = reinterpret_cast(reinterpret_cast(b)); + EXPECT_EQ(b, c); + } + + { + // use uint32 low 16 bit store float16 + uint32_t c = reinterpret_cast(b); + float16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + } // namespace platform } // namespace paddle #endif // PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0b776528414735e8a7c1e3763e7ccb662bb9f285..6f1f0c4796f3bae2fb419bf103cb6c0c5489bf65 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,6 +23,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); + namespace paddle { namespace framework { @@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN - platform::SetNumThreads(1); + platform::SetNumThreads(FLAGS_paddle_num_threads); #endif } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index a8f93e6848a1db1f5aa0ee266a076af2b5d0c964..10a3ad256b17ba41380cdc0377905d03188cbaa3 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -223,7 +223,7 @@ class MKLDNNHandler { static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT const std::string& suffix) { return dims2str(operand_dims) + suffix; - }; + } protected: static std::string dims2str(const mkldnn::memory::dims& operand_dims) { @@ -251,5 +251,17 @@ inline mkldnn::memory::format MKLDNNFormatForSize( return data_format; } +inline mkldnn::memory::format data_format_to_memory_format( + const std::string& data_format) { + switch (framework::StringToDataLayout(data_format)) { + case framework::DataLayout::kNHWC: + return mkldnn::memory::format::nhwc; + case framework::DataLayout::kNCHW: + return mkldnn::memory::format::nchw; + default: + return mkldnn::memory::format::any; + } +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..d0286719b9ea1aa671294f519051ac1e269c4e93 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include -#include #include #include #include @@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() { .count(); } -inline uint64_t PosixInNsec() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); -} - Event::Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { @@ -110,6 +103,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id, has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; if (has_cuda_) { auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaSetDevice( + boost::get(cuda_dev_ctx->GetPlace()).device)); PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaEventCreate(&event_)); auto stream = cuda_dev_ctx->stream(); @@ -176,6 +171,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; dev_ctx_ = dev_ctx; @@ -186,11 +182,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), - BlockDepth(), CurThread()); + BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); @@ -198,6 +195,7 @@ RecordEvent::~RecordEvent() { RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; SetCurBlock(block_id); @@ -205,27 +203,18 @@ RecordBlock::RecordBlock(int block_id) } RecordBlock::~RecordBlock() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - CurThread()); + g_thread_id); } ClearCurBlock(); } -RecordThread::RecordThread(int thread_id) { - if (g_state == ProfilerState::kDisabled) return; - SetCurThread(thread_id); -} - -RecordThread::~RecordThread() { - if (g_state == ProfilerState::kDisabled) return; - ClearCurThread(); -} - void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -95,11 +95,6 @@ struct RecordBlock { uint64_t start_ns_; }; -struct RecordThread { - explicit RecordThread(int thread_id); - ~RecordThread(); -}; - // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ee1c8d46ddfb4f0c09591bb78dc720555dc735b4..2320f3e4dbc5d98698670fee19ed983411a802a9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -394,8 +394,10 @@ All parameter, weight, gradient are variables in Paddle. 
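Note: with RecordThread gone, the profiler hunks above make RecordEvent/RecordBlock take profiler_mu in both constructor and destructor and tag records with the shared g_thread_id. A rough Python analogue of that scope-guard pattern, with illustrative names (this is not a Paddle API):

    import threading
    import time

    _profiler_mu = threading.Lock()
    _records = []  # (name, start_ns, end_ns)

    class RecordEvent(object):
        """Scope guard: stamp the start on entry, emit the record on exit."""
        def __init__(self, name):
            self.name = name
            self.start_ns = None
        def __enter__(self):
            with _profiler_mu:   # mirrors the lock_guard in the constructor
                self.start_ns = time.time_ns()
            return self
        def __exit__(self, exc_type, exc, tb):
            with _profiler_mu:   # mirrors the lock_guard in the destructor
                _records.append((self.name, self.start_ns, time.time_ns()))
            return False

    with RecordEvent("forward"):
        time.sleep(0.01)
    print(_records)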
InferenceOptimize(*(origin.Proto()), &pruned_desc); return new ProgramDesc(pruned_desc); }); - m.def("empty_var_name", []() { return framework::kEmptyVarName; }); - m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; }); + m.def("empty_var_name", + []() { return std::string(framework::kEmptyVarName); }); + m.def("grad_var_suffix", + []() { return std::string(framework::kGradVarSuffix); }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9e58a39eb0939fa15e9c19e1e6fc89a6f99d9a0c..8460f93b841fe136db138e0dc7576f3aacdbeb5f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -419,6 +419,25 @@ EOF linkchecker doc/v2/en/html/index.html linkchecker doc/v2/cn/html/index.html linkchecker doc/v2/api/en/html/index.html + + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + + # Deploy to the content server if it's a "develop" or "release/version" branch + # The "develop_doc" branch is reserved to test full deploy process without impacting the real content. + if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then + PPO_SCRIPT_BRANCH=develop + elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then + PPO_SCRIPT_BRANCH=master + else + # Early exit, this branch doesn't require a documentation build + return 0; + fi + # Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh + export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH} + cd - } function gen_html() { @@ -534,7 +553,7 @@ EOF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build cp -r fluid_install_dir fluid - tar -cf fluid.tgz fluid + tar -czf fluid.tgz fluid fi } @@ -547,6 +566,7 @@ function test_fluid_inference_lib() { EOF cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} + ./clean.sh fi } diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index 3462deb9c2f88b6da643d6aa833449ed5f4a9b34..174c2a12f007b282a5182c0aec9b0a6bec9e55fa 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -52,6 +52,9 @@ EOL ${DOCKER_CMD} run -it \ ${DOCKER_ENV} \ -e SCRIPT_NAME=$0 \ + -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \ + -e TRAVIS_BRANCH=$TRAVIS_BRANCH \ + -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \ -v $PADDLE_ROOT:/paddle \ -v ${HOME}/.ccache:/root/.ccache \ -w /paddle \ diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..6e92c60ea2db00cc6e227830228888f9a06735c4 --- /dev/null +++ b/patches/grpc/completion_queue.h @@ -0,0 +1,386 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// A completion queue implements a concurrent producer-consumer queue, with +/// two main API-exposed methods: \a Next and \a AsyncNext. These +/// methods are the essential component of the gRPC C++ asynchronous API. +/// There is also a \a Shutdown method to indicate that a given completion queue +/// will no longer have regular events. This must be called before the +/// completion queue is destroyed. +/// All completion queue APIs are thread-safe and may be used concurrently with +/// any other completion queue API invocation; it is acceptable to have +/// multiple threads calling \a Next or \a AsyncNext on the same or different +/// completion queues, or to call these methods concurrently with a \a Shutdown +/// elsewhere. +/// \remark{All other API calls on completion queue should be completed before +/// a completion queue destructor is called.} +#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H +#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H + +#include + +#include +#include +#include +#include +#include +#include + +struct grpc_completion_queue; + +namespace grpc { + +template +class ClientReader; +template +class ClientWriter; +template +class ClientReaderWriter; +template +class ServerReader; +template +class ServerWriter; +namespace internal { +template +class ServerReaderWriterBody; +} // namespace internal + +class Channel; +class ChannelInterface; +class ClientContext; +class CompletionQueue; +class Server; +class ServerBuilder; +class ServerContext; +class ServerInterface; + +namespace internal { +class CompletionQueueTag; +class RpcMethod; +template +class RpcMethodHandler; +template +class ClientStreamingHandler; +template +class ServerStreamingHandler; +template +class BidiStreamingHandler; +class UnknownMethodHandler; +template +class TemplatedBidiStreamingHandler; +template +class BlockingUnaryCallImpl; +} // namespace internal + +extern CoreCodegenInterface* g_core_codegen_interface; + +/// A thin wrapper around \ref grpc_completion_queue (see \ref +/// src/core/lib/surface/completion_queue.h). +/// See \ref doc/cpp/perf_notes.md for notes on best practices for high +/// performance servers. +class CompletionQueue : private GrpcLibraryCodegen { + public: + /// Default constructor. Implicitly creates a \a grpc_completion_queue + /// instance. + CompletionQueue() + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {} + + /// Wrap \a take, taking ownership of the instance. + /// + /// \param take The completion queue instance to wrap. Ownership is taken. + explicit CompletionQueue(grpc_completion_queue* take); + + /// Destructor. Destroys the owned wrapped completion queue / instance. + ~CompletionQueue() { + if (typeid(*g_core_codegen_interface).hash_code() != + typeid(CoreCodegenInterface).hash_code()) { + g_core_codegen_interface->grpc_completion_queue_destroy(cq_); + } + } + + /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. 
+ enum NextStatus { + SHUTDOWN, ///< The completion queue has been shut down and fully drained + GOT_EVENT, ///< Got a new event; \a tag will be filled in with its + ///< associated value; \a ok indicating its success. + TIMEOUT ///< deadline was reached. + }; + + /// Read from the queue, blocking until an event is available or the queue is + /// shutting down. + /// + /// \param tag[out] Updated to point to the read event's tag. + /// \param ok[out] true if a successful event was read, false otherwise. + /// + /// Note that each tag sent to the completion queue (through RPC operations + /// or alarms) will be delivered out of the completion queue by a call to + /// Next (or a related method), regardless of whether the operation succeeded + /// or not. Success here means that this operation completed in the normal + /// valid manner. + /// + /// Server-side RPC request: \a ok indicates that the RPC has indeed + /// been started. If it is false, the server has been Shutdown + /// before this particular call got matched to an incoming RPC. + /// + /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is + /// going to go to the wire. If it is false, it is not going to the wire. This + /// would happen if the channel is either permanently broken or + /// transiently broken but with the fail-fast option. (Note that async unary + /// RPCs don't post a CQ tag at this point, nor do client-streaming + /// or bidi-streaming RPCs that have the initial metadata corked option set.) + /// + /// Client-side Write, Client-side WritesDone, Server-side Write, + /// Server-side Finish, Server-side SendInitialMetadata (which is + /// typically included in Write or Finish when not done explicitly): + /// \a ok means that the data/metadata/status/etc is going to go to the + /// wire. If it is false, it is not going to the wire because the call + /// is already dead (i.e., canceled, deadline expired, other side + /// dropped the channel, etc). + /// + /// Client-side Read, Server-side Read, Client-side + /// RecvInitialMetadata (which is typically included in Read if not + /// done explicitly): \a ok indicates whether there is a valid message + /// that got read. If not, you know that there are certainly no more + /// messages that can ever be read from this stream. For the client-side + /// operations, this only happens because the call is dead. For the + /// server-side operation, though, this could happen because the client + /// has done a WritesDone already. + /// + /// Client-side Finish: \a ok should always be true + /// + /// Server-side AsyncNotifyWhenDone: \a ok should always be true + /// + /// Alarm: \a ok is true if it expired, false if it was canceled + /// + /// \return true if got an event, false if the queue is fully drained and + /// shut down. + bool Next(void** tag, bool* ok) { + return (AsyncNextInternal(tag, + ok, + g_core_codegen_interface->gpr_inf_future( + GPR_CLOCK_REALTIME)) != SHUTDOWN); + } + + /// Read from the queue, blocking up to \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param tag[out] Upon success, updated to point to the event's tag. + /// \param ok[out] Upon success, true if a successful event was read, false otherwise + /// See documentation for CompletionQueue::Next for explanation of ok + /// \param deadline[in] How long to block in wait for an event.
+ /// + /// \return The type of event read. + template + NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) { + TimePoint deadline_tp(deadline); + return AsyncNextInternal(tag, ok, deadline_tp.raw_time()); + } + + /// EXPERIMENTAL + /// First executes \a F, then reads from the queue, blocking up to + /// \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param F[in] Function to execute before calling AsyncNext on this queue. + /// \param tag[out] Upon success, updated to point to the event's tag. + /// \param ok[out] Upon success, true if a regular event was read, false otherwise. + /// \param deadline[in] How long to block in wait for an event. + /// + /// \return The type of event read. + template + NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) { + CompletionQueueTLSCache cache = CompletionQueueTLSCache(this); + f(); + if (cache.Flush(tag, ok)) { + return GOT_EVENT; + } else { + return AsyncNext(tag, ok, deadline); + } + } + + /// Request the shutdown of the queue. + /// + /// \warning This method must be called at some point if this completion queue + /// is accessed with Next or AsyncNext. \a Next will not return false + /// until this method has been called and all pending tags have been drained. + /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .) + /// Only once either one of these methods does that (that is, once the queue + /// has been \em drained) can an instance of this class be destroyed. + /// Also note that applications must ensure that no work is enqueued on this + /// completion queue after this method is called. + void Shutdown(); + + /// Returns a \em raw pointer to the underlying \a grpc_completion_queue + /// instance. + /// + /// \warning Remember that the returned instance is owned. No transfer of + /// ownership is performed. + grpc_completion_queue* cq() { return cq_; } + + protected: + /// Private constructor of CompletionQueue only visible to friend classes + CompletionQueue(const grpc_completion_queue_attributes& attributes) { + cq_ = g_core_codegen_interface->grpc_completion_queue_create( + g_core_codegen_interface->grpc_completion_queue_factory_lookup( + &attributes), + &attributes, + NULL); + InitialAvalanching(); // reserve this for the future shutdown + } + + private: + // Friend synchronous wrappers so that they can access Pluck(), which is + // a semi-private API geared towards the synchronous implementation.
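Note: the Next/AsyncNext contract documented above reduces to a tri-state dequeue: an event, a timeout, or shutdown-after-drain. A toy Python model of that contract (hypothetical code for illustration, not gRPC's Python API):

    import enum
    import queue

    class NextStatus(enum.Enum):
        SHUTDOWN = 0
        GOT_EVENT = 1
        TIMEOUT = 2

    class ToyCompletionQueue(object):
        _SHUTDOWN = object()
        def __init__(self):
            self._q = queue.Queue()
        def add(self, tag, ok):
            self._q.put((tag, ok))       # producer side: an operation completing
        def shutdown(self):
            self._q.put(self._SHUTDOWN)  # no work may be enqueued after this
        def async_next(self, timeout):
            try:
                ev = self._q.get(timeout=timeout)
            except queue.Empty:
                return NextStatus.TIMEOUT, None, None
            if ev is self._SHUTDOWN:
                return NextStatus.SHUTDOWN, None, None
            tag, ok = ev
            return NextStatus.GOT_EVENT, tag, ok

    cq = ToyCompletionQueue()
    cq.add("rpc-1", True)
    cq.shutdown()
    print(cq.async_next(0.1))  # GOT_EVENT with tag "rpc-1", ok=True
    print(cq.async_next(0.1))  # SHUTDOWN once the queue is drained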
+ template + friend class ::grpc::ClientReader; + template + friend class ::grpc::ClientWriter; + template + friend class ::grpc::ClientReaderWriter; + template + friend class ::grpc::ServerReader; + template + friend class ::grpc::ServerWriter; + template + friend class ::grpc::internal::ServerReaderWriterBody; + template + friend class ::grpc::internal::RpcMethodHandler; + template + friend class ::grpc::internal::ClientStreamingHandler; + template + friend class ::grpc::internal::ServerStreamingHandler; + template + friend class ::grpc::internal::TemplatedBidiStreamingHandler; + friend class ::grpc::internal::UnknownMethodHandler; + friend class ::grpc::Server; + friend class ::grpc::ServerContext; + friend class ::grpc::ServerInterface; + template + friend class ::grpc::internal::BlockingUnaryCallImpl; + + /// EXPERIMENTAL + /// Creates a Thread Local cache to store the first event + /// on this completion queue queued from this thread. Once + /// initialized, it must be flushed on the same thread. + class CompletionQueueTLSCache { + public: + CompletionQueueTLSCache(CompletionQueue* cq); + ~CompletionQueueTLSCache(); + bool Flush(void** tag, bool* ok); + + private: + CompletionQueue* cq_; + bool flushed_; + }; + + NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline); + + /// Wraps \a grpc_completion_queue_pluck. + /// \warning Must not be mixed with calls to \a Next. + bool Pluck(internal::CompletionQueueTag* tag) { + auto deadline = + g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok)); + GPR_CODEGEN_ASSERT(ignored == tag); + // Ignore mutations by FinalizeResult: Pluck returns the C API status + return ev.success != 0; + } + + /// Performs a single polling pluck on \a tag. + /// \warning Must not be mixed with calls to \a Next. + /// + /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already + /// shut down. This is most likely a bug and if it is a bug, then change this + /// implementation to simply call the other TryPluck function with a zero + /// timeout. i.e: + /// TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME)) + void TryPluck(internal::CompletionQueueTag* tag) { + auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT) return; + bool ok = ev.success != 0; + void* ignored = tag; + // the tag must be swallowed if using TryPluck + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if + /// the pluck() was successful and returned the tag. + /// + /// This expects tag->FinalizeResult (if called) to return 'false', i.e. expects + /// that the tag is internal, not something that is returned to the user. + void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) { + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) { + return; + } + + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Manage state of avalanching operations : completion queue tags that + /// trigger other completion queue operations.
The underlying core completion + /// queue should not really shut down until all avalanching operations have + /// been finalized. Note that we maintain the requirement that an avalanche + /// registration must take place before CQ shutdown (which must be maintained + /// elsewhere) + void InitialAvalanching() { + gpr_atm_rel_store(&avalanches_in_flight_, static_cast(1)); + } + void RegisterAvalanching() { + gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_, + static_cast(1)); + } + void CompleteAvalanching(); + + grpc_completion_queue* cq_; // owned + + gpr_atm avalanches_in_flight_; +}; + +/// A specific type of completion queue used by the processing of notifications +/// by servers. Instantiated by \a ServerBuilder. +class ServerCompletionQueue : public CompletionQueue { + public: + bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; } + + private: + grpc_cq_polling_type polling_type_; + friend class ServerBuilder; + /// \param is_frequently_polled Informs the GRPC library about whether the + /// server completion queue would be actively polled (by calling Next() or + /// AsyncNext()). By default all server completion queues are assumed to be + /// frequently polled. + ServerCompletionQueue(grpc_cq_polling_type polling_type) + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}), + polling_type_(polling_type) {} +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H diff --git a/patches/grpc/fix_too_early_destory.patch b/patches/grpc/fix_too_early_destory.patch deleted file mode 100644 index d7790d56b07551b8daae9b9a41be5432e5b8b9cc..0000000000000000000000000000000000000000 --- a/patches/grpc/fix_too_early_destory.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h -index 80c7c41982..3f7d8a7714 100644 ---- a/include/grpcpp/impl/codegen/completion_queue.h -+++ b/include/grpcpp/impl/codegen/completion_queue.h -@@ -32,6 +32,8 @@ - #ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H - #define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H - -+#include - - #include - #include - #include -@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen { - - /// Destructor. Destroys the owned wrapped completion queue / instance. - ~CompletionQueue() { -- g_core_codegen_interface->grpc_completion_queue_destroy(cq_); -+ if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) { -+ g_core_codegen_interface->grpc_completion_queue_destroy(cq_); -+ } - } - - /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. -diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h -index 17c904d71a..a092b2204d 100644 ---- a/include/grpcpp/impl/codegen/grpc_library.h -+++ b/include/grpcpp/impl/codegen/grpc_library.h -@@ -19,6 +19,8 @@ - #ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H - #define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H - -+#include - - #include - - namespace grpc { -@@ -47,7 +49,8 @@ class GrpcLibraryCodegen { - } - } - virtual ~GrpcLibraryCodegen() { -- if (grpc_init_called_) { -+ if (grpc_init_called_ && -+ typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) { - GPR_CODEGEN_ASSERT(g_glip && - "gRPC library not initialized.
See " - "grpc::internal::GrpcLibraryInitializer."); diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h new file mode 100644 index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47 --- /dev/null +++ b/patches/grpc/grpc_library.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H +#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H + +#include + +#include + +namespace grpc { + +class GrpcLibraryInterface { + public: + virtual ~GrpcLibraryInterface() = default; + virtual void init() = 0; + virtual void shutdown() = 0; +}; + +/// Initialized by \a grpc::GrpcLibraryInitializer from +/// +extern GrpcLibraryInterface* g_glip; + +/// Classes that require gRPC to be initialized should inherit from this class. +class GrpcLibraryCodegen { + public: + GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) { + if (call_grpc_init) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->init(); + grpc_init_called_ = true; + } + } + virtual ~GrpcLibraryCodegen() { + if (grpc_init_called_ && + typeid(*g_glip).hash_code() != + typeid(GrpcLibraryInterface).hash_code()) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->shutdown(); + } + } + + private: + bool grpc_init_called_; +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..008509660739d61245526278735064472b8b06dd 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False): if drop_last == False and len(b) != 0: yield b + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(batch_size)) + return batch_reader diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py index 79ddd8b7e6f31383fa531f398ef37315b92a9807..f6b4ff8fbd0f83b1d652d37c1b2d04efd3c73cbb 100644 --- a/python/paddle/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -28,11 +28,12 @@ images per class. 
""" -import cPickle import itertools import numpy import paddle.dataset.common import tarfile +from six.moves import zip +from six.moves import cPickle as pickle __all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] @@ -48,7 +49,7 @@ def reader_creator(filename, sub_name, cycle=False): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) assert labels is not None - for sample, label in itertools.izip(data, labels): + for sample, label in zip(data, labels): yield (sample / 255.0).astype(numpy.float32), int(label) def reader(): @@ -58,7 +59,7 @@ def reader_creator(filename, sub_name, cycle=False): while True: for name in names: - batch = cPickle.load(f.extractfile(name)) + batch = pickle.load(f.extractfile(name)) for item in read_batch(batch): yield item if not cycle: diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 68660601c161d2332b17b448fae089506238ba78..6195cc50df338e83bea1f4ad416529464636a33e 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -20,9 +20,8 @@ import shutil import sys import importlib import paddle.dataset -import cPickle +import six.moves.cPickle as pickle import glob -import cPickle as pickle __all__ = [ 'DATA_HOME', @@ -75,13 +74,13 @@ def download(url, module_name, md5sum, save_name=None): retry_limit = 3 while not (os.path.exists(filename) and md5file(filename) == md5sum): if os.path.exists(filename): - print "file md5", md5file(filename), md5sum + print("file md5", md5file(filename), md5sum) if retry < retry_limit: retry += 1 else: raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) - print "Cache file %s not found, downloading %s" % (filename, url) + print("Cache file %s not found, downloading %s" % (filename, url)) r = requests.get(url, stream=True) total_length = r.headers.get('content-length') @@ -104,8 +103,9 @@ def download(url, module_name, md5sum, save_name=None): def fetch_all(): - for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.dataset)): + for module_name in [ + x for x in dir(paddle.dataset) if not x.startswith("__") + ]: if "fetch" in dir( importlib.import_module("paddle.dataset.%s" % module_name)): getattr( @@ -114,8 +114,9 @@ def fetch_all(): def fetch_all_recordio(path): - for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.dataset)): + for module_name in [ + x for x in dir(paddle.dataset) if not x.startswith("__") + ]: if "convert" in dir( importlib.import_module("paddle.dataset.%s" % module_name)) and \ not module_name == "common": @@ -126,7 +127,7 @@ def fetch_all_recordio(path): "convert")(ds_path) -def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): +def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump): """ you can call the function as: @@ -167,7 +168,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): def cluster_files_reader(files_pattern, trainer_count, trainer_id, - loader=cPickle.load): + loader=pickle.load): """ Create a reader that yield element from the given files, select a file set according trainer count and trainer_id @@ -188,7 +189,7 @@ def cluster_files_reader(files_pattern, my_file_list = [] for idx, fn in enumerate(file_list): if idx % trainer_count == trainer_id: - print "append file: %s" % fn + print("append file: %s" % fn) my_file_list.append(fn) for fn in my_file_list: with open(fn, "r") as f: @@ -221,7 +222,7 @@ def convert(output_path, reader, line_count, 
name_prefix): for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - writer.write(cPickle.dumps(l)) + writer.write(pickle.dumps(l)) writer.close() lines = [] diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 4e94ce89892f8e6822c15fdc510805e75dfca988..a97c95d067b876a87f0aa19b2ddd0702a848bd4a 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -24,18 +24,19 @@ import tarfile import gzip import itertools import paddle.dataset.common +from six.moves import zip __all__ = ['test, get_dict', 'get_embedding', 'convert'] DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 @@ -87,12 +88,12 @@ def corpus_reader(data_path, words_name, props_name): sentences = [] labels = [] one_seg = [] - for word, label in itertools.izip(words_file, props_file): + for word, label in zip(words_file, props_file): word = word.strip() label = label.strip().split() if len(label) == 0: # end of sentence - for i in xrange(len(one_seg[0])): + for i in range(len(one_seg[0])): a_kind_lable = [x[i] for x in one_seg] labels.append(a_kind_lable) diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 2354987d20b908a32209f9ac22a2065ee43c3dfd..914dae348bc94d061072543aa14aba2219f4b52d 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -28,10 +28,9 @@ Graphics and Image Processing (2008) http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. 
""" -import cPickle import itertools import functools -from common import download +from .common import download import tarfile import scipy.io as scio from paddle.dataset.image import * @@ -39,6 +38,8 @@ from paddle.reader import * import os import numpy as np from multiprocessing import cpu_count +from six.moves import cPickle as pickle +from six.moves import zip __all__ = ['train', 'test', 'valid'] DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' @@ -116,10 +117,10 @@ def reader_creator(data_file, file = file.strip() batch = None with open(file, 'r') as f: - batch = cPickle.load(f) + batch = pickle.load(f) data = batch['data'] labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): + for sample, label in zip(data, batch['label']): yield sample, int(label) - 1 if not cycle: break diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 9235c41e9eb95b25a0dc53a494a203e7a4525981..3b3d89c93c48d611dccf6f14958c310a6cac1a7b 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -36,7 +36,7 @@ except ImportError: cv2 = None import os import tarfile -import cPickle +import six.moves.cPickle as pickle __all__ = [ "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", @@ -86,10 +86,10 @@ def batch_images_from_tar(data_file, output = {} output['label'] = labels output['data'] = data - cPickle.dump( + pickle.dump( output, open('%s/batch_%d' % (out_path, file_id), 'w'), - protocol=cPickle.HIGHEST_PROTOCOL) + protocol=pickle.HIGHEST_PROTOCOL) file_id += 1 data = [] labels = [] @@ -97,10 +97,10 @@ def batch_images_from_tar(data_file, output = {} output['label'] = labels output['data'] = data - cPickle.dump( + pickle.dump( output, open('%s/batch_%d' % (out_path, file_id), 'w'), - protocol=cPickle.HIGHEST_PROTOCOL) + protocol=pickle.HIGHEST_PROTOCOL) with open(meta_file, 'a') as meta: for file in os.listdir(out_path): diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index 5ff05b1e9b7f4c42909370a21beb140ecdcd6868..e7fe4e0b7e5832c2bc7ca1307725936a70292c39 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -42,13 +42,13 @@ def tokenize(pattern): # sequential access of member files, other than # tarfile.extractfile, which does random access and might # destroy hard disks. - tf = tarf.next() + tf = next(tarf) while tf != None: if bool(pattern.match(tf.name)): # newline and punctuations removal and ad-hoc tokenization. yield tarf.extractfile(tf).read().rstrip("\n\r").translate( None, string.punctuation).lower().split() - tf = tarf.next() + tf = next(tarf) def build_dict(pattern, cutoff): @@ -62,11 +62,11 @@ def build_dict(pattern, cutoff): word_freq[word] += 1 # Not sure if we should prune less-frequent words here. 
- word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff] dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*dictionary)) - word_idx = dict(zip(words, xrange(len(words)))) + word_idx = dict(list(zip(words, list(range(len(words)))))) word_idx[''] = len(words) return word_idx diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py index c6c0a0f54373dd068b2c493f6fbc9c8578593aef..bc007c9d3c8e2f1e4ff091f7c2c93eacbbe8d0e0 100644 --- a/python/paddle/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -64,11 +64,11 @@ def build_dict(min_word_freq=50): # remove for now, since we will set it as last index del word_freq[''] - word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq] word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) words, _ = list(zip(*word_freq_sorted)) - word_idx = dict(zip(words, xrange(len(words)))) + word_idx = dict(list(zip(words, list(range(len(words)))))) word_idx[''] = len(words) return word_idx diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 6259cc35b4f7bb781886bb5da9d16924831d7246..ffa9008c80129b80b3807dbab37bc198e59cf5a2 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -65,7 +65,7 @@ def reader_creator(image_filename, label_filename, buffer_size): images = images / 255.0 * 2.0 - 1.0 - for i in xrange(buffer_size): + for i in range(buffer_size): yield images[i, :], int(labels[i]) finally: try: diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index ab11716202a8298c182e23b661eb1d2ac74bf4da..056ec2178607329dd6daa1764820c2312bbaed59 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -16,7 +16,7 @@ Movielens 1-M dataset. Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was collected by GroupLens Research. This module will download -Movielens 1-M dataset from +Movielens 1-M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training set and test set into paddle reader creators. @@ -187,7 +187,7 @@ def max_movie_id(): Get the maximum value of movie id. """ __initialize_meta_info__() - return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + return reduce(__max_index_info__, list(MOVIE_INFO.values())).index def max_user_id(): @@ -195,7 +195,7 @@ def max_user_id(): Get the maximum value of user id. """ __initialize_meta_info__() - return reduce(__max_index_info__, USER_INFO.viewvalues()).index + return reduce(__max_index_info__, list(USER_INFO.values())).index def __max_job_id_impl__(a, b): @@ -210,7 +210,7 @@ def max_job_id(): Get the maximum value of job id. 
""" __initialize_meta_info__() - return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id def movie_categories(): @@ -243,7 +243,7 @@ def unittest(): for test_count, _ in enumerate(test()()): pass - print train_count, test_count + print(train_count, test_count) def fetch(): diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py index d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1..cc4d088316dfd490dc9d6b247c66c2495cedf2c3 100644 --- a/python/paddle/dataset/mq2007.py +++ b/python/paddle/dataset/mq2007.py @@ -26,7 +26,7 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20 import os import functools import rarfile -from common import download +from .common import download import numpy as np # URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" @@ -53,7 +53,7 @@ class Query(object): ---------- query_id : int query_id in dataset, mapping from query to relevance documents - relevance_score : int + relevance_score : int relevance score of query and document pair feature_vector : array, dense feature feature in vector format @@ -92,7 +92,7 @@ class Query(object): sys.stdout.write("expect 48 space split parts, get %d" % (len(parts))) return None - # format : 0 qid:10 1:0.000272 2:0.000000 .... + # format : 0 qid:10 1:0.000272 2:0.000000 .... self.relevance_score = int(parts[0]) self.query_id = int(parts[1].split(':')[1]) for p in parts[2:]: @@ -295,7 +295,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): -------- filename : string fill_missing : fill the missing value. default in MQ2007 is -1 - + Returns ------ yield @@ -330,4 +330,4 @@ if __name__ == "__main__": mytest = functools.partial( __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise") for label, query in mytest(): - print label, query + print(label, query) diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py index f5461164fe6b816356e42fc7b7dcf388eccfadfb..953ada057bc114ebbfe39011d2fd3b5b7a2b0d37 100644 --- a/python/paddle/dataset/sentiment.py +++ b/python/paddle/dataset/sentiment.py @@ -43,11 +43,11 @@ def download_data_if_not_yet(): nltk.data.path.append(paddle.dataset.common.DATA_HOME) movie_reviews.categories() except LookupError: - print "Downloading movie_reviews data set, please wait....." + print("Downloading movie_reviews data set, please wait.....") nltk.download( 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) - print "Download data set success....." 
- print "Path is " + nltk.data.find('corpora/movie_reviews').path + print("Download data set success.....") + print("Path is " + nltk.data.find('corpora/movie_reviews').path) def get_word_dict(): @@ -64,7 +64,7 @@ def get_word_dict(): for field in movie_reviews.fileids(category): for words in movie_reviews.words(field): word_freq_dict[words] += 1 - words_sort_list = word_freq_dict.items() + words_sort_list = list(word_freq_dict.items()) words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) for index, word in enumerate(words_sort_list): words_freq_sorted.append((word[0], index)) @@ -80,7 +80,8 @@ def sort_files(): files_list = list() neg_file_list = movie_reviews.fileids('neg') pos_file_list = movie_reviews.fileids('pos') - files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + files_list = list( + chain.from_iterable(list(zip(neg_file_list, pos_file_list)))) return files_list diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py index e7cc02aa83061599ffefa18de6cb02ac0fc9e9b7..777cd06a19726f8ad73774c958c8cb512808a3aa 100644 --- a/python/paddle/dataset/tests/common_test.py +++ b/python/paddle/dataset/tests/common_test.py @@ -36,7 +36,7 @@ class TestCommon(unittest.TestCase): def test_split(self): def test_reader(): def reader(): - for x in xrange(10): + for x in range(10): yield x return reader @@ -49,7 +49,7 @@ class TestCommon(unittest.TestCase): def test_cluster_file_reader(self): _, temp_path = tempfile.mkstemp() - for x in xrange(5): + for x in range(5): with open(temp_path + '/%05d.test' % x) as f: f.write('%d\n' % x) reader = paddle.dataset.common.cluster_files_reader( @@ -63,7 +63,7 @@ class TestCommon(unittest.TestCase): def test_reader(): def reader(): - for x in xrange(record_num): + for x in range(record_num): yield x return reader diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py index 233fd9fc8cea4cd0b5cd052580030fc8c993693c..50f50d947d221686d6308a6ed44cbcff3b10c6f5 100644 --- a/python/paddle/dataset/tests/imikolov_test.py +++ b/python/paddle/dataset/tests/imikolov_test.py @@ -59,7 +59,7 @@ class TestMikolov(unittest.TestCase): self.assertEqual(first_line, read_line) def test_total(self): - _, idx = zip(*WORD_DICT.items()) + _, idx = list(zip(*list(WORD_DICT.items()))) self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py index 543f4b7378b583ea3857bf785cf330c43e535c2a..37326517f7b39fb74c694684eb8a547d5f021946 100644 --- a/python/paddle/dataset/tests/test_sentiment.py +++ b/python/paddle/dataset/tests/test_sentiment.py @@ -24,9 +24,8 @@ from nltk.corpus import movie_reviews class TestSentimentMethods(unittest.TestCase): def test_get_word_dict(self): word_dict = st.get_word_dict()[0:10] - test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3), - (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7), - (u'is', 8), (u'in', 9)] + test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4), + ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)] for idx, each in enumerate(word_dict): self.assertEqual(each, test_word_list[idx]) self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index fbfa477d055eb5f484989eacce38cee8d617d729..410ca7af0d6d1dc26acbf92fce5e49fce7d3a3bb 100644 --- a/python/paddle/dataset/uci_housing.py +++ 
b/python/paddle/dataset/uci_housing.py @@ -49,9 +49,12 @@ def feature_range(maximums, minimums): import matplotlib.pyplot as plt fig, ax = plt.subplots() feature_num = len(maximums) - ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.bar(list(range(feature_num)), + maximums - minimums, + color='r', + align='center') ax.set_title('feature scale') - plt.xticks(range(feature_num), feature_names) + plt.xticks(list(range(feature_num)), feature_names) plt.xlim([-1, feature_num]) fig.set_figheight(6) fig.set_figwidth(10) @@ -71,7 +74,7 @@ def load_data(filename, feature_num=14, ratio=0.8): maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( axis=0) / data.shape[0] feature_range(maximums[:-1], minimums[:-1]) - for i in xrange(feature_num - 1): + for i in range(feature_num - 1): data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) offset = int(data.shape[0] * ratio) UCI_TRAIN_DATA = data[:offset] diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f0908c737874fa7335cca5b5f0cba83190c9f90f..c9a257ba3f86aac79bfe891d78fdaf36d2df8a50 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' 'wmt_shrinked_data/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" @@ -154,8 +154,8 @@ def get_dict(dict_size, reverse=True): tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) src_dict, trg_dict = __read_to_dict(tar_file, dict_size) if reverse: - src_dict = {v: k for k, v in src_dict.items()} - trg_dict = {v: k for k, v in trg_dict.items()} + src_dict = {v: k for k, v in list(src_dict.items())} + trg_dict = {v: k for k, v in list(trg_dict.items())} return src_dict, trg_dict diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 540d43b692e0f65460f558dd74a52715ff4db68d..4e3c466c38e402cc574e93ef3a5935edf8f9dd3b 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -70,7 +70,9 @@ def __build_dict(tar_file, dict_size, save_path, lang): fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) for idx, word in enumerate( sorted( - word_dict.iteritems(), key=lambda x: x[1], reverse=True)): + iter(list(word_dict.items())), + key=lambda x: x[1], + reverse=True)): if idx + 3 == dict_size: break fout.write("%s\n" % (word[0])) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9903047f749b942c50692ac7f9164fc1c91569b4..cccb4abe6cf14ac3e90ba59d970c9398b09b7db2 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -14,81 +14,81 @@ from __future__ import print_function # import all class inside framework into fluid module -import framework -from framework import * +from . 
import framework +from .framework import * # import all class inside executor into fluid module -import executor -from executor import * - -import trainer -from trainer import Trainer -from trainer import BeginEpochEvent -from trainer import EndEpochEvent -from trainer import BeginStepEvent -from trainer import EndStepEvent -from trainer import CheckpointConfig - -import inferencer -from inferencer import Inferencer - -import io -import evaluator -import initializer -import layers -import contrib -import nets -import optimizer -import backward -import regularizer -import average -import metrics -import transpiler -from param_attr import ParamAttr, WeightNormParamAttr -from data_feeder import DataFeeder -from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope -from transpiler import DistributeTranspiler, InferenceTranspiler, \ +from . import executor +from .executor import * + +from . import trainer +from .trainer import Trainer +from .trainer import BeginEpochEvent +from .trainer import EndEpochEvent +from .trainer import BeginStepEvent +from .trainer import EndStepEvent +from .trainer import CheckpointConfig + +from . import inferencer +from .inferencer import Inferencer + +from . import io +from . import evaluator +from . import initializer +from . import layers +from . import contrib +from . import nets +from . import optimizer +from . import backward +from . import regularizer +from . import average +from . import metrics +from . import transpiler +from .param_attr import ParamAttr, WeightNormParamAttr +from .data_feeder import DataFeeder +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig -from concurrency import (Go, make_channel, channel_send, channel_recv, - channel_close, Select) -from lod_tensor import create_lod_tensor, create_random_int_lodtensor -import clip -import profiler -import unique_name -import recordio_writer -import parallel_executor -from parallel_executor import * +from .concurrency import (Go, make_channel, channel_send, channel_recv, + channel_close, Select) +from .lod_tensor import create_lod_tensor, create_random_int_lodtensor +from . import clip +from . import profiler +from . import unique_name +from . import recordio_writer +from . 
import parallel_executor +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ - trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ - 'io', - 'initializer', - 'layers', - 'contrib', - 'transpiler', - 'nets', - 'optimizer', - 'learning_rate_decay', - 'backward', - 'regularizer', - 'LoDTensor', - 'LoDTensorArray', - 'CPUPlace', - 'CUDAPlace', - 'CUDAPinnedPlace', - 'Tensor', - 'ParamAttr', - 'WeightNormParamAttr', - 'DataFeeder', - 'clip', - 'profiler', - 'unique_name', - 'recordio_writer', - 'Scope', - ] + trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ + parallel_executor.__all__ + lod_tensor.__all__ + [ + 'io', + 'initializer', + 'layers', + 'contrib', + 'transpiler', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'LoDTensorArray', + 'CPUPlace', + 'CUDAPlace', + 'CUDAPinnedPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'profiler', + 'unique_name', + 'recordio_writer', + 'Scope', + ] def __bootstrap__(): @@ -99,8 +99,8 @@ def __bootstrap__(): None """ import sys - import core import os + from . import core in_test = 'unittest' in sys.modules @@ -123,10 +123,13 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', + 'cpu_deterministic' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') + read_env_flags.append('rpc_server_profile_period') + read_env_flags.append('rpc_server_profile_path') if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py index bb8756a4664013643c278c013ca21bb237a6b4a7..15e7976354f2a22065f1723bfa696d056181dac2 100644 --- a/python/paddle/fluid/annotations.py +++ b/python/paddle/fluid/annotations.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import functools import sys @@ -28,7 +29,7 @@ def deprecated(since, instead, extra_message=""): @functools.wraps(func) def wrapper(*args, **kwargs): - print >> sys.stderr, err_msg + print(err_msg, file=sys.stderr) return func(*args, **kwargs) wrapper.__doc__ += "\n " diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 812f68bdd849544456b2e0ebf0b739f4f92b09ea..6b739745119836a80518d7bc6138b37101e74431 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -16,7 +16,8 @@ from paddle.fluid import framework as framework from . import core import collections import copy -import unique_name +import six +from . 
import unique_name __all__ = ['append_backward'] @@ -44,17 +45,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): """ op_desc = core.OpDesc() op_desc.set_type(op_type) - for para, args in inputs.iteritems(): - op_desc.set_input(para, args) - for para, args in outputs.iteritems(): - op_desc.set_output(para, args) + for para, args in list(inputs.items()): + op_desc.set_input( + para, + list( + map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg, + args))) + for para, args in list(outputs.items()): + op_desc.set_output( + para, + list( + map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg, + args))) op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() if op_role_attr_name not in attrs: attrs[ op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward - for name, val in attrs.iteritems(): + for name, val in list(attrs.items()): if isinstance(val, framework.Block): op_desc.set_block_attr(name, val.desc) else: @@ -105,7 +114,9 @@ def _strip_grad_suffix_(name): e.g. x@GRAD ==> x y@GRAD@RENAME@1 ==> y """ - pos = name.find(core.grad_var_suffix()) + if isinstance(name, six.text_type): + name = name.encode() + pos = name.find(six.b(core.grad_var_suffix())) return name[:pos] if pos != -1 else name @@ -114,7 +125,9 @@ def _append_grad_suffix_(name): Append grad suffix to the given variable name e.g. x ==> x@GRAD """ - return name + core.grad_var_suffix() + if isinstance(name, six.text_type): + name = name.encode() + return name + six.b(core.grad_var_suffix()) def _addup_repetitive_outputs_(op_descs): @@ -174,7 +187,7 @@ def _addup_repetitive_outputs_(op_descs): op_desc.set_output(param_name, arg_names) renamed_vars[var_name].append(new_name) - for var_name, inputs in renamed_vars.iteritems(): + for var_name, inputs in list(renamed_vars.items()): if len(inputs) > 1: pending_sum_ops.append( (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, @@ -198,16 +211,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): out_arg_names = op_desc.output_arg_names() if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set): return True - if _all_in_set_( - filter(lambda name: name.find(core.grad_var_suffix()) != -1, - op_desc.input_arg_names()), no_grad_set): + if _all_in_set_([ + name for name in op_desc.input_arg_names() + if name.find(core.grad_var_suffix()) != -1 + ], no_grad_set): no_grad_set.update(out_arg_names) return True return False # Remove ops whose outputs are all in no_grad_dict - op_descs = filter( - lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs) + op_descs = [ + op_desc for op_desc in op_descs + if not _op_can_be_removed_(op_desc, no_grad_set) + ] # Insert fill_zeros_like_op to_insert = [] for idx, op_desc in enumerate(op_descs): @@ -217,12 +233,12 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): "X": [_strip_grad_suffix_(arg)] }, {"Out": [arg]}, {}), idx)) - map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) + list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) return op_descs -import proto.framework_pb2 as framework_pb2 +from .proto import framework_pb2 def serialize_op_decs(op_desc): @@ -244,8 +260,10 @@ def _callback_lookup_(op): if op.type == 'parallel_do' and op.attr('use_nccl'): all_vars = op.block.vars param_names = set(op.input('parameters')) - param_names = filter(lambda name: all_vars[name].stop_gradient is False, - param_names) + param_names = [ + name for name in param_names + if all_vars[name].stop_gradient is False + ] 
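The backward.py hunks above repeat one Python 2/3 compatibility pattern: normalize op argument names that may arrive as bytes, push gradient-suffix handling through six.b so str and bytes compare consistently, and wrap dict views in list() because items()/values() return lazy views on Python 3. A minimal standalone sketch of that pattern (plain Python only; "@GRAD" stands in for core.grad_var_suffix(), and the dict below stands in for a real OpDesc):

```python
# Minimal standalone sketch of the six-based pattern in the hunks above;
# no Paddle objects are used, so every name here is illustrative.
import six


def normalize_args(args):
    # six.binary_type is `str` on Python 2 and `bytes` on Python 3, so
    # decoding only the bytes values yields text on both interpreters.
    return [
        arg.decode() if isinstance(arg, six.binary_type) else arg
        for arg in args
    ]


def strip_grad_suffix(name, suffix="@GRAD"):
    # Mirrors _strip_grad_suffix_: normalize to bytes first so find()
    # compares like with like whether `name` arrived as text or bytes.
    if isinstance(name, six.text_type):
        name = name.encode()
    pos = name.find(six.b(suffix))
    return name[:pos] if pos != -1 else name


# dict.iteritems()/itervalues() are gone in Python 3; items()/values()
# wrapped in list() give a concrete sequence on both versions, which is
# the replacement applied throughout this file.
attrs = {"op_role": 1}
for name, val in list(attrs.items()):
    pass  # e.g. copy each attribute onto a new op desc
```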
param_grad_names = [n + "@GRAD" for n in param_names] class ParallelDoCallBack(object): @@ -399,7 +417,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): continue block.desc.var(grad_var_name) new_vars.add(grad_var_name) - if not grad_to_var.has_key(grad_var_name): + if grad_var_name not in grad_to_var: continue grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) # infer_shape and infer_type @@ -427,7 +445,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): op_desc.rename_output(name, new_name) var_map[name] = new_name - for g, ng in var_map.iteritems(): + for g, ng in list(var_map.items()): if g in grad_to_var: grad_to_var[ng] = grad_to_var[g] grad_to_var.pop(g) @@ -439,7 +457,7 @@ def _get_stop_gradients_(program): for block in program.blocks: assert isinstance(block, framework.Block) block_no_grad_set = set() - for var in block.vars.itervalues(): + for var in list(block.vars.values()): assert isinstance(var, framework.Variable) if var.stop_gradient: block_no_grad_set.add(_append_grad_suffix_(var.name)) @@ -452,51 +470,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, """ Append backward part to main_program. - A complete neural network training is made up of forward and backward - propagation. However, when we configure a network, we only need to - specify its forwrd part. The backward part is generated automatically + A complete neural network training is made up of forward and backward + propagation. However, when we configure a network, we only need to + specify its forwrd part. The backward part is generated automatically according to the forward part by this function. - In most cases, users do not need to invoke this function manually. It + In most cases, users do not need to invoke this function manually. It will be automatically invoked by the optimizer's `minimize` function. Args: loss(Variable): The loss variable of the network. - parameter_list(list[string]|None): Names of parameters that need - to be updated by optimizers. - If it is None, all parameters + parameter_list(list[string]|None): Names of parameters that need + to be updated by optimizers. + If it is None, all parameters will be updated. Default: None - no_grad_set(set|None): Variables in the Block 0 whose gradients - should be ignored. All variables with - `step_gradient=True` from all blocks will + no_grad_set(set|None): Variables in the Block 0 whose gradients + should be ignored. All variables with + `step_gradient=True` from all blocks will be automatically added into this set. Default: None - callbacks(list[callable object]|None): The callbacks are used for - doing some custom jobs during - backward part building. All - callable objects in it will - be invoked once each time a - new gradient operator is added - into the program. The callable - object must has two input - parameters: 'block' and 'context'. - The 'block' is the block which - the new gradient operator will - be added to. The 'context' is a - map, whose keys are gradient - variable names and values are + callbacks(list[callable object]|None): The callbacks are used for + doing some custom jobs during + backward part building. All + callable objects in it will + be invoked once each time a + new gradient operator is added + into the program. The callable + object must has two input + parameters: 'block' and 'context'. + The 'block' is the block which + the new gradient operator will + be added to. 
The 'context' is a + map, whose keys are gradient + variable names and values are corresponding original variables. - In addition to this, the 'context' - has another special key-value pair: - the key is string '__current_op_desc__' - and the value is the op_desc of the - gradient operator who has just - triggered the callable object. + In addition to this, the 'context' + has another special key-value pair: + the key is string '__current_op_desc__' + and the value is the op_desc of the + gradient operator who has just + triggered the callable object. Returns: - list[(Variable,Variable)]: Pairs of parameter and its - corresponding gradients. The key is the parameter and the + list[(Variable,Variable)]: Pairs of parameter and its + corresponding gradients. The key is the parameter and the value is gradient variable. Raises: @@ -535,7 +553,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, no_grad_set = set() no_grad_set = copy.copy(no_grad_set) no_grad_dict = _get_stop_gradients_(program) - no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) grad_info_map = dict() root_block = program.block(0) @@ -558,7 +576,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) - no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, grad_to_var, callbacks) @@ -572,8 +590,6 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, program.current_block_idx = current_block_idx program._sync_with_cpp() - # FIXME(zcd): prevent loss.grad optimized by mem_opt. - loss.block.var(_append_grad_suffix_(loss.name)).persistable = True if parameter_list is not None: parameters = parameter_list @@ -699,7 +715,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): no_grad_set = set() no_grad_set = copy.copy(no_grad_set) no_grad_dict = _get_stop_gradients_(prog) - no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) fwd_op_num = block.desc.op_size() @@ -733,7 +749,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(block, targets, inputs, block_no_grad_set) - no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index c029662ebc1b7e7f7d1ea44b4ebd4b08b812a579..4b0a792f784fffcce3f911d3e7448b472d39f8e1 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -13,10 +13,11 @@ # limitations under the License. import copy +import six import functools -import layers -import framework +from . import layers +from . import framework from . 
import core __all__ = [ @@ -80,8 +81,7 @@ def error_clip_callback(block, context): # the context is a grad_to_var map grad_to_var = context op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in filter(lambda n: grad_to_var.has_key(n), - op_desc.output_arg_names()): + for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: fwd_var = block._var_recursive(grad_to_var[grad_n]) error_clip = getattr(fwd_var, "error_clip", None) if not (error_clip is None or isinstance(error_clip, @@ -247,8 +247,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): """ def __init__(self, clip_norm, group_name="default_group"): - if not isinstance(group_name, basestring): - raise TypeError("'group_name' must be a basestring.") + if not isinstance(group_name, six.string_types): + raise TypeError("'group_name' must be a %s." % (six.string_types)) self.clip_norm = clip_norm self.group_name = group_name @@ -284,7 +284,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): x=clip_var, y=layers.elementwise_max( x=clip_var, y=group_norm_var)) - assert group_scale_var.shape == (1L, ) + assert group_scale_var.shape == (1, ) self.context[group_scale_name] = group_scale_var new_grad = layers.elementwise_mul( @@ -313,7 +313,7 @@ def set_gradient_clip(clip, param_list=None, program=None): program = framework.default_main_program() if param_list is None: param_list = program.block(0).all_parameters() - if all(isinstance(elem, basestring) for elem in param_list): + if all(isinstance(elem, six.string_types) for elem in param_list): param_list = [program.block(0).var(elem) for elem in param_list] if not all(isinstance(elem, framework.Parameter) for elem in param_list): raise TypeError( diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index b8fe9bd4c1988dd3f6fa82df391c3059dfbfcf93..a8c4d66720d6eda857e5960d86fc3b8ec8f11ade 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from layers.control_flow import BlockGuard, equal +from .layers.control_flow import BlockGuard, equal from .framework import Operator -from layer_helper import LayerHelper, unique_name -from layers import fill_constant -import core +from .layer_helper import LayerHelper, unique_name +from .layers import fill_constant +from . import core __all__ = [ 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 12cd5d918e93181c6b7e328e6aee4ad941b0a0da..58f2da1c3ba2f84602e7a18c7b1c78d1f0d2ede1 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import decoder -from decoder import * +from . import decoder +from .decoder import * +from . 
import memory_usage_calc +from .memory_usage_calc import * -__all__ = decoder.__all__ +__all__ = decoder.__all__ + memory_usage_calc.__all__ diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py index 22cfe692690a686f32eba34ee34b9193f0d5ba35..6343c1543d206f82e605c5c986fa91d70c467113 100644 --- a/python/paddle/fluid/contrib/decoder/__init__.py +++ b/python/paddle/fluid/contrib/decoder/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import beam_search_decoder -from beam_search_decoder import * +from . import beam_search_decoder +from .beam_search_decoder import * __all__ = beam_search_decoder.__all__ diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py index ba6e13878291ad9f30e92f998767df6d8c6f32c3..d268a948f7a2cf038a419c95521b81088ed8215f 100644 --- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py +++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py @@ -22,6 +22,7 @@ This API is still under active development and may change drastically. import contextlib import numpy as np +import six from ... import layers from ...framework import Variable @@ -191,7 +192,7 @@ class StateCell(object): self._helper = LayerHelper('state_cell', name=name) self._cur_states = {} self._state_names = [] - for state_name, state in states.items(): + for state_name, state in six.iteritems(states): if not isinstance(state, InitState): raise ValueError('state must be an InitState object.') self._cur_states[state_name] = state @@ -346,7 +347,7 @@ class StateCell(object): if self._in_decoder and not self._switched_decoder: self._switch_decoder() - for input_name, input_value in inputs.items(): + for input_name, input_value in six.iteritems(inputs): if input_name not in self._inputs: raise ValueError('Unknown input %s. ' 'Please make sure %s in input ' @@ -361,7 +362,7 @@ class StateCell(object): if self._in_decoder and not self._switched_decoder: self._switched_decoder() - for state_name, decoder_state in self._states_holder.items(): + for state_name, decoder_state in six.iteritems(self._states_holder): if id(self._cur_decoder_obj) not in decoder_state: raise ValueError('Unknown decoder object, please make sure ' 'switch_decoder been invoked.') @@ -671,7 +672,7 @@ class BeamSearchDecoder(object): feed_dict = {} update_dict = {} - for init_var_name, init_var in self._input_var_dict.items(): + for init_var_name, init_var in six.iteritems(self._input_var_dict): if init_var_name not in self.state_cell._inputs: raise ValueError('Variable ' + init_var_name + ' not found in StateCell!\n') @@ -721,7 +722,8 @@ class BeamSearchDecoder(object): self.state_cell.update_states() self.update_array(prev_ids, selected_ids) self.update_array(prev_scores, selected_scores) - for update_name, var_to_update in update_dict.items(): + for update_name, var_to_update in six.iteritems( + update_dict): self.update_array(var_to_update, feed_dict[update_name]) def read_array(self, init, is_ids=False, is_scores=False): diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py new file mode 100644 index 0000000000000000000000000000000000000000..5da846edb63c28efd791fdfac4046cfa56c24181 --- /dev/null +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides a memory usage calculation function for users.
+The purpose of this API is to allow users to estimate the memory usage of
+a program under a specific batch size, so that they can choose an
+appropriate batch size to fully utilize a GPU.
+
+This API is still under active development and may change drastically.
+"""
+
+from .. import core
+from ..framework import Program, Variable
+
+__all__ = ['memory_usage']
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+DEBUG = False
+
+
+def memory_usage(program, batch_size):
+    """
+    Get the estimated memory usage of a program with the given batch size.
+
+    Args:
+        program(Program): The current Program.
+        batch_size(int): The current input data batch_size.
+
+    Returns:
+        min_total_memory(float): the estimated memory usage lower bound.
+        max_total_memory(float): the estimated memory usage upper bound.
+        unit_str(string): the unit of the estimated usage result.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+                fluid.default_main_program(), batch_size=10)
+        >>> print("memory usage is about %.3f - %.3f %s" %
+                  (lower_usage, upper_usage, unit))
+
+    """
+
+    # Parameters check
+    if not isinstance(program, Program):
+        raise TypeError(
+            "Calculating Memory Usage requires Program as its Parameter. "
+            "But you passed in %s" % (type(program)))
+    if batch_size <= 0:
+        raise ValueError("The batch size must be positive.")
+
+    # Walk the variables of the first (global) block and sum up their sizes
+    total_memory = 0.0
+    for var in list(program.global_block().vars.values()):
+        data_count = 1
+        for x in var.shape:
+            if x == -1:
+                data_count *= batch_size
+            else:
+                data_count *= x
+        var_memory = data_count * dtype_to_size[var.dtype]
+        if DEBUG:
+            print("%s memory usage: %d" % (var.name, var_memory))
+        total_memory += var_memory
+    if DEBUG:
+        print("total memory usage: %.2f" % (total_memory))
+
+    # Convert to an appropriate unit
+    unit_str = "B"
+    if total_memory > 1024:
+        total_memory /= 1024
+        unit_str = "KB"
+        if total_memory > 1024:
+            total_memory /= 1024
+            unit_str = "MB"
+
+    # Append extra memory consumption (5% - 10%)
+    min_total_memory = total_memory * 1.05
+    max_total_memory = total_memory * 1.1
+
+    return min_total_memory, max_total_memory, unit_str
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index c859778b3757f638ac531620f241e684522add57..9452cf0e2a3a2eddb761149466bfc1ee3d23dfd9 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from __future__ import print_function
-import core
+from . 
import core import numpy import os -import six.moves as six +import six +from six.moves import zip, range, xrange import multiprocessing -from framework import Variable, default_main_program +from .framework import Variable, default_main_program __all__ = ['DataFeeder'] @@ -53,7 +53,7 @@ class DataToLoDTensorConverter(object): self.data = [] self.lod = [] - for i in six.range(lod_level): + for i in six.moves.range(lod_level): self.lod.append([]) def feed(self, data): @@ -142,7 +142,7 @@ class DataFeeder(object): if program is None: program = default_main_program() for each_var in feed_list: - if isinstance(each_var, basestring): + if isinstance(each_var, six.string_types): each_var = program.block(0).var(each_var) if not isinstance(each_var, Variable): raise TypeError("Feed list should contain a list of variable") @@ -174,7 +174,7 @@ class DataFeeder(object): dict: the result of conversion. """ converter = [] - for lod_level, shape, dtype in six.zip( + for lod_level, shape, dtype in six.moves.zip( self.feed_lod_level, self.feed_shapes, self.feed_dtypes): converter.append( DataToLoDTensorConverter( @@ -187,10 +187,12 @@ class DataFeeder(object): assert len(each_sample) == len(converter), ( "The number of fields in data (%s) does not match " + "len(feed_list) (%s)") % (len(each_sample), len(converter)) - for each_converter, each_slot in six.zip(converter, each_sample): + for each_converter, each_slot in six.moves.zip(converter, + each_sample): each_converter.feed(each_slot) ret_dict = {} - for each_name, each_converter in six.zip(self.feed_names, converter): + for each_name, each_converter in six.moves.zip(self.feed_names, + converter): ret_dict[each_name] = each_converter.done() return ret_dict @@ -212,12 +214,14 @@ class DataFeeder(object): if isinstance(self.place, core.CUDAPlace): places = [ core.CUDAPlace(i) - for i in six.xrange(self._get_number_of_places_(num_places)) + for i in six.moves.xrange( + self._get_number_of_places_(num_places)) ] else: places = [ core.CPUPlace() - for _ in six.xrange(self._get_number_of_places_(num_places)) + for _ in six.moves.xrange( + self._get_number_of_places_(num_places)) ] if len(iterable) != len(places): @@ -227,7 +231,7 @@ class DataFeeder(object): "must be same.") place = self.place - for p, batch in six.zip(places, iterable): + for p, batch in six.moves.zip(places, iterable): self.place = p yield self.feed(batch) self.place = place diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index 1c56064a1e8bdc5d975837cb5a75a40d557765ad..b7a92cf044900acdd41ede378dd68aa2d9c6b2dc 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -14,8 +14,8 @@ import sys import re -from graphviz import GraphPreviewGenerator -import proto.framework_pb2 as framework_pb2 +from .graphviz import GraphPreviewGenerator +from .proto import framework_pb2 from google.protobuf import text_format _vartype2str_ = [ diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 00ba1a0457583d1cc1fa7136ebd51e9ced167832..c0671cce9a1f169f02ba03a839c45b6e4df2c47a 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -15,11 +15,11 @@ import warnings import numpy as np -import layers -from framework import Program, Variable, program_guard -import unique_name -from layer_helper import LayerHelper -from initializer import Constant +from . import layers +from .framework import Program, Variable, program_guard +from . 
import unique_name +from .layer_helper import LayerHelper +from .initializer import Constant __all__ = [ 'ChunkEvaluator', diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 4178971398c953236bf8de4d5cb6e93d0e33380c..35da1d06a2c1da8ba663ea0f0b9a0e58ea7c4470 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -14,7 +14,8 @@ import numpy as np import contextlib -from framework import Program, default_main_program, Variable +import six +from .framework import Program, default_main_program, Variable from . import core __all__ = [ @@ -204,19 +205,19 @@ def fetch_var(name, scope=None, return_numpy=True): def _get_program_cache_key(feed, fetch_list): - feed_var_names = feed.keys() + feed_var_names = list(feed.keys()) def to_name_str(var): if isinstance(var, Variable): return var.desc.name() elif isinstance(var, str): return var - elif isinstance(var, basestring): + elif isinstance(var, six.string_types): return str(var) else: raise TypeError(str(var) + " should be Variable or str") - fetch_var_names = map(to_name_str, fetch_list) + fetch_var_names = list(map(to_name_str, fetch_list)) return str(feed_var_names + fetch_var_names) @@ -229,8 +230,8 @@ class Executor(object): to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides the variables(or names) that user want to get after program run. Note: the executor will run all operators in the program but not only the operators dependent by the fetch_list. - It store the global variables into the global scope, and create a local scope for the temporary - variables. The local scope contents will be discarded after every minibatch forward/backward finished. + It store the global variables into the global scope, and create a local scope for the temporary + variables. The local scope contents will be discarded after every minibatch forward/backward finished. But the global scope variables will be persistent through different runs. All of ops in program will be running in sequence. @@ -345,7 +346,7 @@ class Executor(object): def _fetch_data(self, fetch_list, fetch_var_name, scope): outs = [ core.get_fetch_variable(scope, fetch_var_name, i) - for i in xrange(len(fetch_list)) + for i in range(len(fetch_list)) ] return outs diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9fc3849ee071dba0c4d39e2d661f5afd064d1210..3111674abbf92e3f608b625023993c69cf326351 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,21 +15,22 @@ import collections import contextlib import re +import six import numpy as np -import proto.framework_pb2 as framework_pb2 +from .proto import framework_pb2 try: from . import core -except ImportError, e: +except ImportError as e: raise ImportError( """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" if you encounters \"libmkldnn.so not found\" errors. If you have python installed in other directory, replace \"/usr/local/lib\" with your own directory. The original error is: \n""" + e.message) -except Exception, e: +except Exception as e: raise e -import unique_name +from . 
import unique_name __all__ = [ 'Program', @@ -86,7 +87,7 @@ def convert_np_dtype_to_dtype_(np_dtype): elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: - raise ValueError("Not supported numpy dtype " + str(dtype)) + raise ValueError("Not supported numpy dtype " + six.binary_type(dtype)) def dtype_is_floating(dtype): @@ -129,15 +130,15 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): """ - In Fluid, every input and output of an operator is a variable. In most - cases, variables are used for holding different kinds of data or training - labels. A variable belongs to a block. All variable has its own name and + In Fluid, every input and output of an operator is a variable. In most + cases, variables are used for holding different kinds of data or training + labels. A variable belongs to a block. All variable has its own name and two variables in different blocks could have the same name. - There are many kinds of variables. Each kind of them has its own attributes - and usages. Please reference the framework.proto for details. + There are many kinds of variables. Each kind of them has its own attributes + and usages. Please reference the framework.proto for details. - Most of a Variable's member variables can be setted to be None. It mean + Most of a Variable's member variables can be setted to be None. It mean it is not available or will be specified later. Args: @@ -197,6 +198,7 @@ class Variable(object): if name is None: name = unique_name.generate('_generated_var') is_new_var = False + name = name if isinstance(name, six.binary_type) else name.encode() self.desc = self.block.desc.find_var(name) if self.desc is None: @@ -290,13 +292,13 @@ class Variable(object): assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) protostr = self.desc.serialize_to_string() - proto = framework_pb2.VarDesc.FromString(str(protostr)) + proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) if with_details: additional_attr = ("error_clip", "stop_gradient") for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - str(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, six.binary_type(getattr(self, attr_name))) return res_str __repr__ = __str__ @@ -369,7 +371,7 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr)) ret_values.append(op_proto) return ret_values @@ -472,7 +474,6 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - self.block = block self.desc = desc self.attrs = attrs @@ -523,10 +524,19 @@ class Operator(object): % (in_proto.name, len(in_args))) in_arg_names = [] for arg in in_args: - if isinstance(arg, basestring): + if isinstance(arg, six.string_types): in_arg_names.append(arg) + elif isinstance(arg, six.binary_type): + in_arg_names.append(arg.decode()) else: - in_arg_names.append(arg.name) + if isinstance(arg.name, six.string_types): + in_arg_names.append(arg.name) + elif isinstance(arg.name, six.binary_type): + in_arg_names.append(arg.name.decode()) + else: + raise TypeError( + "arguments require unicode, str or bytes, but get %s instead." 
+ % (type(arg.name))) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -541,8 +551,9 @@ class Operator(object): if not given == need: raise ValueError(("Incorrect setting for output(s) of " "operator \"%s\". Need: [%s] Given: [%s]") % - (type, ", ".join(str(e) for e in need), - ", ".join(str(e) for e in given))) + (type, + ", ".join(six.binary_type(e) for e in need), + ", ".join(six.binary_type(e) for e in given))) for out_proto in proto.outputs: out_args = outputs[out_proto.name] @@ -554,7 +565,14 @@ class Operator(object): (out_proto.name, len(out_args))) out_arg_names = [] for arg in out_args: - out_arg_names.append(arg.name) + if isinstance(arg.name, six.string_types): + out_arg_names.append(arg.name) + elif isinstance(arg.name, six.binary_type): + out_arg_names.append(arg.name.decode()) + else: + raise TypeError( + "arguments require unicode, str or bytes, but get %s instead." + % (type(arg.name))) arg.op = self self.desc.set_output(out_proto.name, out_arg_names) @@ -590,7 +608,7 @@ class Operator(object): """ protostr = self.desc.serialize_to_string() - proto = framework_pb2.OpDesc.FromString(str(protostr)) + proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr)) return _debug_string_(proto, throw_on_error) def __str__(self): @@ -845,7 +863,7 @@ class Block(object): re_add_indent = re.compile(r"\n(.)") res_str = "blocks {\n idx: %d\n parent_idx: %d" % ( self.idx, self.parent_idx) - for var in self.vars.itervalues(): + for var in list(self.vars.values()): res_str += "\n vars {\n %s }" % re_add_indent.sub( r"\n \1", var.to_string(throw_on_error, with_details)) for op in self.ops: @@ -854,7 +872,8 @@ class Block(object): res_str += "\n}" else: protostr = self.desc.serialize_to_string() - proto = framework_pb2.BlockDesc.FromString(str(protostr)) + proto = framework_pb2.BlockDesc.FromString( + six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) return res_str @@ -898,10 +917,11 @@ class Block(object): Returns: Variable: the Variable with the giving name. """ - if not isinstance(name, basestring): - raise TypeError( - "var require string as parameter, but get %s instead." % - (type(name))) + if not isinstance(name, six.string_types): + if not isinstance(name, six.binary_type): + raise TypeError( + "var require string as parameter, but get %s instead." 
%
+                    (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -949,10 +969,10 @@ class Block(object):
             raise ValueError("Var {0} is not found recursively".format(name))

     def all_parameters(self):
-        return list(self._iter_parameters())
+        return list(self.iter_parameters())

-    def _iter_parameters(self):
-        return (item[1] for item in self.vars.iteritems()
+    def iter_parameters(self):
+        return (item[1] for item in list(self.vars.items())
                 if isinstance(item[1], Parameter))

     def create_var(self, *args, **kwargs):
@@ -1038,7 +1058,26 @@ class Block(object):
         global_block = self.program.global_block()
         param = Parameter(global_block, *args, **kwargs)
         if 'initializer' in kwargs:
-            kwargs['initializer'](param, self)
+
+            def _is_inited_by(block, var):
+                init_ops = []
+                for op in block.ops:
+                    if var.name in op.output_arg_names:
+                        init_ops.append(op)
+                return init_ops
+
+            initializer = kwargs['initializer']
+            init_ops = _is_inited_by(global_block, param)
+            init_ops_len = len(init_ops)
+            if init_ops_len > 1:
+                raise RuntimeError("param " + param.name +
+                                   " is inited by multiple init ops " + str(
+                                       init_ops))
+            elif init_ops_len == 1:
+                # TODO: already inited, do nothing, should log a warning
+                pass
+            else:
+                initializer(param, self)
         return param

     def append_op(self, *args, **kwargs):
@@ -1113,7 +1152,7 @@ class Block(object):
                 self.create_var(name=var.name(), desc=var, type=var.type())

         # sync variables removed from c++ end
-        for var in self.vars.keys():
+        for var in list(self.vars.keys()):
             if not self.desc.find_var(var):
                 self.vars.pop(var)
@@ -1185,7 +1224,7 @@ class Block(object):
         if not isinstance(other, Block):
             raise TypeError(
                 "_copy_param_info_from should be invoked with Block")
-        for p in other._iter_parameters():
+        for p in other.iter_parameters():
             assert isinstance(p, Parameter)
             v = self.vars.get(p.name, None)
             if v is None:
@@ -1385,7 +1424,8 @@ class Program(object):
                 res_str += block.to_string(throw_on_error, with_details)
         else:
             protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            proto = framework_pb2.ProgramDesc.FromString(
+                six.binary_type(protostr))
             res_str = _debug_string_(proto, throw_on_error)
         return res_str
@@ -1483,7 +1523,7 @@ class Program(object):
         else:
             p = Program()
             p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
             p._sync_with_cpp()

         p._copy_param_info_from(self)
@@ -1535,13 +1575,18 @@ class Program(object):
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
         res.desc = core.prune(self.desc, targets_idx)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
         res._sync_with_cpp()
         return res

     def inference_optimize(self):
         """
-        This method will create a new program and change the :code:`is_test`
+        This method will create a new program and do the following adjustments on it:
+        1. Remove all reader variables and their creator ops if they exist.
+
+        2. Remove the :code:`read_op` if it exists.
+
+        3. Change the :code:`is_test`
         attribute of operators to :code:`True`. All the :code:`Parameter`
         information will be lost.
@@ -1555,13 +1600,29 @@ class Program(object):
         # core.inference_optimize being fixed.
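The inference_optimize hunk that follows implements the adjustments listed in the docstring above: find the first `read` op, drop it and every op before it, delete READER variables, then flip the remaining ops into test mode. A rough standalone sketch of that pass, with plain dicts standing in for the C++ BlockDesc/OpDesc API (`strip_readers` is an illustrative name, not a Paddle function):

```python
# Rough standalone sketch of the reader-stripping pass added below;
# plain dicts and lists stand in for the real block/op descriptors.
def strip_readers(ops, variables):
    # Locate the first 'read' op; it and everything before it exist only
    # to feed training data, so they are dropped for inference.
    read_op_idx = 0
    while read_op_idx < len(ops) and ops[read_op_idx]["type"] != "read":
        read_op_idx += 1
    if read_op_idx < len(ops):
        del ops[:read_op_idx + 1]

    # Reader variables are no longer referenced by any remaining op.
    variables = [v for v in variables if v["type"] != "READER"]

    # Flip every op that supports it into test mode.
    for op in ops:
        if "is_test" in op:
            op["is_test"] = True
    return ops, variables


# Example: a feed pipeline followed by a conv op in test-capable mode.
ops = [{"type": "create_reader"}, {"type": "read"},
       {"type": "conv2d", "is_test": False}]
variables = [{"name": "reader", "type": "READER"},
             {"name": "w", "type": "LOD_TENSOR"}]
ops, variables = strip_readers(ops, variables)
assert ops == [{"type": "conv2d", "is_test": True}]
```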
         res = Program()
         res.desc = core.ProgramDesc(self.desc)
-        for i in xrange(res.desc.num_blocks()):
+
+        # remove all readers and the read_op if they exist
+        read_op_idx = 0
+        root_block = res.desc.block(0)
+        while True:
+            if read_op_idx >= root_block.op_size() or root_block.op(
+                    read_op_idx).type() == 'read':
+                break
+            read_op_idx += 1
+        if read_op_idx < root_block.op_size():
+            root_block._remove_op(0, read_op_idx + 1)
+        for var in root_block.all_vars():
+            if var.type() == core.VarDesc.VarType.READER:
+                root_block._remove_var(var.name())
+
+        # change all `is_test` attributes to True
+        for i in range(res.desc.num_blocks()):
             block = res.desc.block(i)
-            for j in xrange(block.op_size()):
+            for j in range(block.op_size()):
                 op = block.op(j)
                 if op.has_attr('is_test'):
                     op.set_attr('is_test', True)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
         res._sync_with_cpp()
         return res
@@ -1574,14 +1635,14 @@
         and deserialization.

         Args:
-            binary_str(str): The binary prootbuf string.
+            binary_str(str): The binary protobuf string.

         Returns:
             Program: A deserialized program desc.
         """
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
+        p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
         p._sync_with_cpp()
         return p
@@ -1609,7 +1670,7 @@ class Program(object):
         self._seed = seed

     def __repr__(self):
-        return str(self)
+        return self.__str__()

     def global_block(self):
         """
@@ -1720,7 +1781,7 @@ class Program(object):
         if len(self.blocks) != len(other.blocks):
             raise ValueError("_copy_param_info_from should be invoked with two "
                              "program, with represent the same topology")
-        for var in other.global_block().vars.itervalues():
+        for var in list(other.global_block().vars.values()):
             if var.is_data:
                 self.global_block().var(var.name).is_data = True
@@ -1732,15 +1793,15 @@ class Program(object):
         iterable: The generator will yield every variable in this program.
         """
         for each_block in self.blocks:
-            for each_var in each_block.vars.itervalues():
+            for each_var in list(each_block.vars.values()):
                 yield each_var


 class Parameter(Variable):
     """
-    Parameter is derived from Variable. A parameter is a persistable
+    Parameter is derived from Variable. A parameter is a persistable
     Variable, and will be updated by optimizers after each iteration.
-    The training of a neural network is essentially the updating of
+    The training of a neural network is essentially the updating of 
Relative to a general Variable, a Parameter has several its own @@ -1806,8 +1867,8 @@ class Parameter(Variable): additional_attr = ("trainable", "optimize_attr", "regularizer", "gradient_clip_attr", "do_model_average") for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - str(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, six.binary_type(getattr(self, attr_name))) else: res_str = Variable.to_string(self, throw_on_error, False) return res_str diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py index 125b4efa9d476e561bd78d0365cd92bbf7e66605..ba67bf5ae6fe44ea23414d444a270c436c195326 100644 --- a/python/paddle/fluid/graphviz.py +++ b/python/paddle/fluid/graphviz.py @@ -14,12 +14,13 @@ import os import random +import six import subprocess import logging def crepr(v): - if type(v) is str or type(v) is unicode: + if isinstance(v, six.string_types): return '"%s"' % v return str(v) @@ -104,7 +105,7 @@ class Graph(object): def _rank_repr(self): ranks = sorted( - self.rank_groups.items(), + list(self.rank_groups.items()), cmp=lambda a, b: a[1].priority > b[1].priority) repr = [] for x in ranks: @@ -148,7 +149,7 @@ class Node(object): name=self.name, label=self.label, extra=',' + ','.join("%s=%s" % (key, crepr(value)) - for key, value in self.attrs.items()) + for key, value in list(self.attrs.items())) if self.attrs else "") return reprs @@ -172,7 +173,7 @@ class Edge(object): target=self.target.name, extra="" if not self.attrs else "[" + ','.join("{}={}".format(attr[0], crepr(attr[1])) - for attr in self.attrs.items()) + "]") + for attr in list(self.attrs.items())) + "]") return repr diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index a81e39695b78f235d6ae896d90117dd392692634..ff382d8b832b4b2bc6779dbb28d3fd95c8a0984e 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -14,14 +14,14 @@ import contextlib -import core - -import executor -import framework -import io -import parallel_executor -import unique_name -from trainer import check_and_get_place +from . import core + +from . import executor +from . import framework +from . import io +from . import parallel_executor +from . import unique_name +from .trainer import check_and_get_place __all__ = ['Inferencer', ] diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 0e640bf280d396504deec1183821da3e8a156530..83290ac60839b855a0348696ad6898af7335e2fc 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import framework +from . 
import framework import numpy as np import contextlib -from framework import convert_np_dtype_to_dtype_ -from core import VarDesc +from .framework import convert_np_dtype_to_dtype_ +from .core import VarDesc __all__ = [ 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA', diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f36c0a7b0a8848305d5811a81cb1aab322a89aa9..80b258c9c96d53c4ee7196fcbb59e4a22b04ddef 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -16,6 +16,7 @@ import os import errno import time import shutil +import six from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable @@ -92,34 +93,34 @@ def save_vars(executor, """ Save variables to the given directory by executor. - There are two ways to specify variables to be saved: The first way, list - variables in a list and assign it to the `vars`. The second way, assign the - `main_program` with an existing program, then all variables in the program - will be saved. The first way has a higher priority. In other words, if `vars` + There are two ways to specify variables to be saved: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be saved. The first way has a higher priority. In other words, if `vars` are assigned, the `main_program` and the `predicate` will be ignored. - The `dirname` are used to specify the folder where to save variables. - If you prefer to save variables in separate files in the folder `dirname`, - set `filename` None; if you prefer to save all variables in a single file, + The `dirname` are used to specify the folder where to save variables. + If you prefer to save variables in separate files in the folder `dirname`, + set `filename` None; if you prefer to save all variables in a single file, use `filename` to specify it. Args: executor(Executor): The executor to run for saving variables. dirname(str): The directory path. - main_program(Program|None): The program whose variables will be saved. - If it is None, the default main program will + main_program(Program|None): The program whose variables will be saved. + If it is None, the default main program will be used automatically. Default: None - vars(list[Variable]|None): The list that contains all variables to save. + vars(list[Variable]|None): The list that contains all variables to save. It has a higher priority than the `main_program`. Default: None - predicate(function|None): If it is not None, only variables in the - `main_program` that makes predicate(variable)==True - will be saved. It only works when we are using the - `main_program` to specify variables (In other words + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be saved. It only works when we are using the + `main_program` to specify variables (In other words `vars` is None). Default: None - filename(str|None): The file which to save all variables. If you prefer to save + filename(str|None): The file which to save all variables. If you prefer to save variables separately, set it to None. 
Default: None @@ -149,7 +150,7 @@ def save_vars(executor, # The second usage: using `vars` to specify variables var_list = [var_a, var_b, var_c] - fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, + fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") # var_a, var_b and var_c will be saved. And they are going to be # saved in the same file named 'var_file' in the path "./my_paddle_model". @@ -163,7 +164,7 @@ def save_vars(executor, save_vars( executor, dirname=dirname, - vars=filter(predicate, main_program.list_vars()), + vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: save_program = Program() @@ -203,14 +204,14 @@ def save_params(executor, dirname, main_program=None, filename=None): This function filters out all parameters from the give `main_program` and then save them to the folder `dirname` or the file `filename`. - Use the `dirname` to specify the saving folder. If you would like to - save parameters in separate files, set `filename` None; if you would - like to save all parameters in a single file, use `filename` to specify + Use the `dirname` to specify the saving folder. If you would like to + save parameters in separate files, set `filename` None; if you would + like to save all parameters in a single file, use `filename` to specify the file name. - NOTICE: Some variables are not Parameter while they are necessary for - training. So you can NOT save and continue your training just by - `save_params()` and `load_params()`. Please use `save_persistables()` + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` and `load_persistables()` instead. Args: @@ -220,8 +221,8 @@ def save_params(executor, dirname, main_program=None, filename=None): saved. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file to save all parameters. If you prefer - to save parameters in differnet files, set it + filename(str|None): The file to save all parameters. If you prefer + to save parameters in differnet files, set it to None. Default: None @@ -234,7 +235,7 @@ def save_params(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.save_params(executor=exe, dirname=param_path, + fluid.io.save_params(executor=exe, dirname=param_path, main_program=None) """ save_vars( @@ -248,23 +249,23 @@ def save_params(executor, dirname, main_program=None, filename=None): def save_persistables(executor, dirname, main_program=None, filename=None): """ - This function filters out all variables with `persistable==True` from the - give `main_program` and then saves these variables to the folder `dirname` + This function filters out all variables with `persistable==True` from the + give `main_program` and then saves these variables to the folder `dirname` or file `filename`. - The `dirname` is used to specify the folder where persistable variables - are going to be saved. If you would like to save variables in separate - files, set `filename` None; if you would like to save all variables in a + The `dirname` is used to specify the folder where persistable variables + are going to be saved. 
If you would like to save variables in separate
+    files, set `filename` None; if you would like to save all variables in a
+    single file, use `filename` to specify the file name.

     Args:
         executor(Executor): The executor to run for saving persistable variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose persistbale variables will
-                                    be saved. If it is None, the default main
+        main_program(Program|None): The program whose persistable variables will
+                                    be saved. If it is None, the default main
                                     program will be used automatically.
                                     Default: None
-        filename(str|None): The file to saved all variables. If you prefer to
+        filename(str|None): The file to save all variables. If you prefer to
                             save variables in differnet files, set it to None.
                             Default: None

@@ -277,7 +278,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
             prog = fluid.default_main_program()
-            fluid.io.save_persistables(executor=exe, dirname=param_path,
+            fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
     save_vars(
@@ -298,34 +299,34 @@ def load_vars(executor,
     """
     Load variables from the given directory by executor.

-    There are two ways to specify variables to be loaded: The first way, list
-    variables in a list and assign it to the `vars`. The second way, assign the
-    `main_program` with an existing program, then all variables in the program
-    will be loaded. The first way has a higher priority. In other words if `vars`
+    There are two ways to specify variables to be loaded: The first way, list
+    variables in a list and assign it to the `vars`. The second way, assign the
+    `main_program` with an existing program, then all variables in the program
+    will be loaded. The first way has a higher priority. In other words, if `vars`
     are assigned, the `main_program` and the `predicate` will be ignored.

-    The `dirname` are used to specify the folder where to load variables.
-    If variables were saved in separate files in the folder `dirname`,
-    set `filename` None; if all variables were saved in a single file,
+    The `dirname` is used to specify the folder where to load variables.
+    If variables were saved in separate files in the folder `dirname`,
+    set `filename` None; if all variables were saved in a single file,
     use `filename` to specify it.

     Args:
         executor(Executor): The executor to run for loading variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose variables will be loaded.
-                                    If it is None, the default main program will
+        main_program(Program|None): The program whose variables will be loaded.
+                                    If it is None, the default main program will
                                     be used automatically.
                                     Default: None
-        vars(list[Variable]|None): The list that contains all variables to load.
+        vars(list[Variable]|None): The list that contains all variables to load.
                                    It has a higher priority than the `main_program`.
                                    Default: None
-        predicate(function|None): If it is not None, only variables in the
-                                  `main_program` that makes predicate(variable)==True
-                                  will be loaded. It only works when we are using the
-                                  `main_program` to specify variables (In other words
+        predicate(function|None): If it is not None, only variables in the
+                                  `main_program` that makes predicate(variable)==True
+                                  will be loaded. It only works when we are using the
+                                  `main_program` to specify variables (In other words
                                   `vars` is None).
                                   Default: None
-        filename(str|None): The file which saved all required variables. 
If variables + filename(str|None): The file which saved all required variables. If variables were saved in differnet files, set it to None. Default: None @@ -355,9 +356,9 @@ def load_vars(executor, # The second usage: using `vars` to specify variables var_list = [var_a, var_b, var_c] - fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, + fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") - # var_a, var_b and var_c will be loaded. And they are supposed to haven + # var_a, var_b and var_c will be loaded. And they are supposed to haven # been saved in the same file named 'var_file' in the path "./my_paddle_model". """ if vars is None: @@ -369,8 +370,7 @@ def load_vars(executor, load_vars( executor, dirname=dirname, - main_program=main_program, - vars=filter(predicate, main_program.list_vars()), + vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: load_prog = Program() @@ -414,15 +414,15 @@ def load_params(executor, dirname, main_program=None, filename=None): and then trys to load these parameters from the folder `dirname` or the file `filename`. - Use the `dirname` to specify the folder where parameters were saved. If - parameters were saved in separate files in the folder `dirname`, set - `filename` None; if all parameters were saved in a single file, use + Use the `dirname` to specify the folder where parameters were saved. If + parameters were saved in separate files in the folder `dirname`, set + `filename` None; if all parameters were saved in a single file, use `filename` to specify the file name. - NOTICE: Some variables are not Parameter while they are necessary for - training. So you can NOT save and continue your training just by - `save_params()` and `load_params()`. Please use `save_persistables()` - and `load_persistables()` instead. + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. Args: executor(Executor): The executor to run for loading parameters. @@ -431,7 +431,7 @@ def load_params(executor, dirname, main_program=None, filename=None): loaded. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file which saved all parameters. If parameters + filename(str|None): The file which saved all parameters. If parameters were saved in differnet files, set it to None. Default: None @@ -444,7 +444,7 @@ def load_params(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.load_params(executor=exe, dirname=param_path, + fluid.io.load_params(executor=exe, dirname=param_path, main_program=None) """ load_vars( @@ -457,23 +457,23 @@ def load_params(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None): """ - This function filters out all variables with `persistable==True` from the - give `main_program` and then trys to load these variables from the folder + This function filters out all variables with `persistable==True` from the + give `main_program` and then trys to load these variables from the folder `dirname` or the file `filename`. - Use the `dirname` to specify the folder where persistable variables were - saved. 
If variables were saved in separate files, set `filename` None; - if all variables were saved in a single file, use `filename` to specify + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify the file name. Args: executor(Executor): The executor to run for loading persistable variables. dirname(str): The directory path. - main_program(Program|None): The program whose persistbale variables will - be loaded. If it is None, the default main + main_program(Program|None): The program whose persistbale variables will + be loaded. If it is None, the default main program will be used automatically. Default: None - filename(str|None): The file which saved all variables. If variables were + filename(str|None): The file which saved all variables. If variables were saved in differnet files, set it to None. Default: None @@ -486,7 +486,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" prog = fluid.default_main_program() - fluid.io.load_persistables(executor=exe, dirname=param_path, + fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ load_vars( @@ -565,20 +565,20 @@ def save_inference_model(dirname, Args: dirname(str): The directory path to save the inference model. - feeded_var_names(list[str]): Names of variables that need to be feeded data + feeded_var_names(list[str]): Names of variables that need to be feeded data during inference. - target_vars(list[Variable]): Variables from which we can get inference + target_vars(list[Variable]): Variables from which we can get inference results. executor(Executor): The executor that saves the inference model. - main_program(Program|None): The original program, which will be pruned to - build the inference model. If is setted None, + main_program(Program|None): The original program, which will be pruned to + build the inference model. If is setted None, the default main program will be used. Default: None. - model_filename(str|None): The name of file to save the inference program - itself. If is setted None, a default filename + model_filename(str|None): The name of file to save the inference program + itself. If is setted None, a default filename `__model__` will be used. - params_filename(str|None): The name of file to save all related parameters. - If it is setted None, parameters will be saved + params_filename(str|None): The name of file to save all related parameters. + If it is setted None, parameters will be saved in separate files . Returns: @@ -596,20 +596,32 @@ def save_inference_model(dirname, fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], target_vars=[predict_var], executor=exe) - # In this exsample, the function will prune the default main program - # to make it suitable for infering the `predict_var`. The pruned - # inference program is going to be saved in the "./infer_model/__model__" + # In this exsample, the function will prune the default main program + # to make it suitable for infering the `predict_var`. The pruned + # inference program is going to be saved in the "./infer_model/__model__" # and parameters are going to be saved in separate files under folder - # "./infer_model". + # "./infer_model". 
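Taken together, the save_inference_model and load_inference_model examples in these io.py docstrings form one round trip. A minimal sketch that only strings those documented calls together, assuming `img`, `predict_var` and `tensor_img` are defined as in the examples (they are not defined here):

```python
# Hedged round-trip sketch assembled from the docstring examples above.
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"

# Prune default_main_program() to what `predict_var` needs and save it.
fluid.io.save_inference_model(dirname=path,
                              feeded_var_names=['img'],
                              target_vars=[predict_var],
                              executor=exe)

# Later, e.g. in a serving process: restore the program and its targets.
[inference_program, feed_target_names, fetch_targets] = (
    fluid.io.load_inference_model(dirname=path, executor=exe))
results = exe.run(inference_program,
                  feed={feed_target_names[0]: tensor_img},
                  fetch_list=fetch_targets)
```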
""" - if isinstance(feeded_var_names, basestring): + if isinstance(feeded_var_names, six.binary_type): feeded_var_names = [feeded_var_names] + elif isinstance(feeded_var_names, six.text_type): + feeded_var_names = [feeded_var_names.encode()] else: if len(feeded_var_names) > 0: + # TODO(paddle-dev): polish these code blocks if not (bool(feeded_var_names) and all( - isinstance(name, basestring) for name in feeded_var_names)): - raise ValueError("'feed_var_names' should be a list of str.") + isinstance(name, six.binary_type) + for name in feeded_var_names)): + if not (all( + isinstance(name, six.text_type) + for name in feeded_var_names)): + raise ValueError( + "'feed_var_names' should be a list of str.") + else: + feeded_var_names = [ + name.encode() for name in feeded_var_names + ] if isinstance(target_vars, Variable): target_vars = [target_vars] @@ -666,22 +678,22 @@ def load_inference_model(dirname, dirname(str): The directory path executor(Executor): The executor to run for loading inference model. model_filename(str|None): The name of file to load inference program. - If it is None, the default filename + If it is None, the default filename '__model__' will be used. Default: None params_filename(str|None): The name of file to load all parameters. - It is only used for the case that all - parameters were saved in a single binary - file. If parameters were saved in separate + It is only used for the case that all + parameters were saved in a single binary + file. If parameters were saved in separate files, set it as 'None'. Returns: tuple: The return of this function is a tuple with three elements: - (program, feed_target_names, fetch_targets). The `program` is a - Program, it's the program for inference. The `feed_target_names` is - a list of str, it contains Names of variables that need to feed - data in the inference program. The `fetch_targets` is a list of - Variable. It contains variables from which we can get inference + (program, feed_target_names, fetch_targets). The `program` is a + Program, it's the program for inference. The `feed_target_names` is + a list of str, it contains Names of variables that need to feed + data in the inference program. The `fetch_targets` is a list of + Variable. It contains variables from which we can get inference results. Raises: @@ -692,17 +704,17 @@ def load_inference_model(dirname, exe = fluid.Executor(fluid.CPUPlace()) path = "./infer_model" - [inference_program, feed_target_names, fetch_targets] = + [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe) results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) - # In this exsample, the inference program was saved in the - # "./infer_model/__model__" and parameters were saved in - # separate files in ""./infer_model". - # After getting inference program, feed target names and - # fetch targets, we can use an Executor to run the inference + # In this exsample, the inference program was saved in the + # "./infer_model/__model__" and parameters were saved in + # separate files in ""./infer_model". + # After getting inference program, feed target names and + # fetch targets, we can use an Executor to run the inference # program to get the inference result. 
""" diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index de752d1daeb6bc725cf6eff1bb74a786e2ad6b95..0c2b1eb795860373220eb254612161f7dc816ffd 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -14,12 +14,14 @@ import copy import itertools +import six -from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating -import unique_name +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from . import unique_name from paddle.fluid.initializer import Constant, Xavier -from param_attr import ParamAttr, WeightNormParamAttr -import core +from .param_attr import ParamAttr, WeightNormParamAttr +from . import core +from six.moves import zip class LayerHelper(object): @@ -83,7 +85,7 @@ class LayerHelper(object): raise ValueError("parameter number mismatch") elif len(param_attr) == 1 and length != 1: tmp = [None] * length - for i in xrange(length): + for i in range(length): tmp[i] = copy.deepcopy(param_attr[0]) param_attr = tmp return param_attr @@ -91,7 +93,7 @@ class LayerHelper(object): def iter_inputs_and_params(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) param_attrs = self.multiple_param_attr(len(inputs)) - for ipt, param_attr in itertools.izip(inputs, param_attrs): + for ipt, param_attr in zip(inputs, param_attrs): yield ipt, param_attr def input_dtype(self, input_param_name='input'): @@ -218,7 +220,7 @@ class LayerHelper(object): norm = __norm_op(reshape, dim=0, block=block) __reshape_op(norm, out=out, shape=out_shape, block=block) else: - perm = range(len(x.shape)) + perm = list(range(len(x.shape))) perm[0], perm[dim] = dim, 0 transpose = __transpose_op(x, perm, block=block) norm = __norm_op(transpose, dim=0, block=block) @@ -397,8 +399,10 @@ class LayerHelper(object): act = self.kwargs.get('act', None) if act is None: return input_var - if isinstance(act, basestring): + if isinstance(act, six.string_types): act = {'type': act} + else: + raise TypeError(str(act) + " should be unicode or str") if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index 4917e67de0d20ff9e8f9a27f38e1bd2abef5c503..a48e360463456ab7e00534dc0684aa153c8205cd 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -12,25 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ops -from ops import * -import nn -from nn import * -import io -from io import * -import tensor -from tensor import * -import control_flow -from control_flow import * -import device -from device import * -import math_op_patch -from math_op_patch import * -import detection -from detection import * -import metric_op -from metric_op import * -from learning_rate_scheduler import * +from . import ops +from .ops import * +from . import nn +from .nn import * +from . import io +from .io import * +from . import tensor +from .tensor import * +from . import control_flow +from .control_flow import * +from . import device +from .device import * +from . import math_op_patch +from .math_op_patch import * +from . import detection +from .detection import * +from . 
import metric_op +from .metric_op import * +from .learning_rate_scheduler import * __all__ = [] __all__ += nn.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index f05ae6d5d1900560e37370121bf64f1fcab14357..9fb7b4d0cad67db2d2d4b56e43d8837b8160cdb0 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,14 +13,16 @@ # limitations under the License. import contextlib -from layer_function_generator import autodoc, templatedoc -from tensor import assign, fill_constant +from .layer_function_generator import autodoc, templatedoc +from .tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu -from ops import logical_and, logical_not, logical_or +from .ops import logical_and, logical_not, logical_or import numpy +import warnings +from functools import reduce __all__ = [ 'While', @@ -275,11 +277,14 @@ class ParallelDo(object): avg_cost = fluid.layers.mean(x=cost) .. warning:: - + It will be soon deprecated, please use ParallelExecutor instead. """ def __init__(self, places, use_nccl=False, name=None): + warnings.warn( + "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.", + Warning) self.helper = LayerHelper("parallel_do", name=name) self.inputs = [] self.places = places @@ -338,7 +343,7 @@ class ParallelDo(object): return [parent_block.var(name) for name in params] - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() @@ -394,7 +399,7 @@ class BlockGuardWithCompletion(BlockGuard): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_op() + self.rnn._complete_op() return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, exc_tb) @@ -470,7 +475,7 @@ class StaticRNN(object): if shape is None or batch_ref is None: raise ValueError( "if init is None, memory at least need shape and batch_ref") - parent_block = self.parent_block() + parent_block = self._parent_block() var_name = unique_name.generate("@".join( [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( @@ -527,7 +532,7 @@ class StaticRNN(object): outputs={'Out': tmp_o}, attrs={'dtype': o.dtype}) - out_var = self.parent_block().create_var( + out_var = self._parent_block().create_var( name=tmp_o.name, shape=[self.seq_len] + list(tmp_o.shape), dtype=tmp_o.dtype) @@ -543,7 +548,7 @@ class StaticRNN(object): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var - def parent_block(self): + def _parent_block(self): prog = self.helper.main_program parent_idx = prog.current_block().parent_idx assert parent_idx >= 0 @@ -560,10 +565,10 @@ class StaticRNN(object): else: return self.outputs - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() - parent_block = self.parent_block() + parent_block = self._parent_block() local_inputs = set() @@ -597,7 +602,7 @@ class StaticRNN(object): boot_memories = [] pre_memories = [] memories = [] - for _, mem in self.memories.iteritems(): + for _, mem in list(self.memories.items()): boot_memories.append(mem.init) pre_memories.append(mem.pre_mem.name) mem_var = rnn_block.var(mem.mem.name) @@ -643,7 +648,7 @@ class 
WhileGuard(BlockGuard): if exc_type is not None: return False self.while_op.status = While.AFTER_WHILE_BLOCK - self.while_op.complete() + self.while_op._complete() return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) @@ -690,7 +695,7 @@ class While(object): def block(self): return WhileGuard(self) - def complete(self): + def _complete(self): main_program = self.helper.main_program while_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() @@ -815,21 +820,21 @@ def max_sequence_len(rank_table): def lod_tensor_to_array(x, table): - """ + """ Convert a LoDTensor to a LoDTensorArray. - This function split a LoDTesnor to a LoDTensorArray according to its LoD - information. LoDTensorArray is an alias of C++ std::vector in - PaddlePaddle. The generated LoDTensorArray of this function can be further read - or written by `read_from_array()` and `write_to_array()` operators. However, - this function is generally an internal component of PaddlePaddle `DynamicRNN`. + This function split a LoDTesnor to a LoDTensorArray according to its LoD + information. LoDTensorArray is an alias of C++ std::vector in + PaddlePaddle. The generated LoDTensorArray of this function can be further read + or written by `read_from_array()` and `write_to_array()` operators. However, + this function is generally an internal component of PaddlePaddle `DynamicRNN`. Users should not use it directly. Args: x (Variable|list): The LoDTensor to be converted to a LoDTensorArray. table (ParamAttr|list): The variable that stores the level of lod which is ordered by sequence length in - descending order. It is generally generated + descending order. It is generally generated by `layers.lod_rank_table()` API. Returns: @@ -1063,9 +1068,9 @@ def array_read(array, i): Given: array = [0.6, 0.1, 0.3, 0.1] - + And: - + i = 2 Then: @@ -1172,9 +1177,9 @@ def array_length(array): class ConditionalBlockGuard(BlockGuard): """ - ConditionalBlockGuard is derived from BlockGuard. It is dedicated for - holding a ConditionalBlock, and helping users entering and exiting the - ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard + ConditionalBlockGuard is derived from BlockGuard. It is dedicated for + holding a ConditionalBlock, and helping users entering and exiting the + ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard is generally an internal component of IfElse, users should not use it directly. """ @@ -1508,7 +1513,7 @@ class IfElse(object): def __call__(self): if self.status != self.OUT_IF_ELSE_BLOCKS: raise ValueError("IfElse::__call__ must be out of sub-block") - false_len, true_len = map(len, self.output_table) + false_len, true_len = list(map(len, self.output_table)) if false_len == 0 and true_len == 0: raise ValueError("Must invoke true_block/false_block before " "__call__") @@ -1928,7 +1933,7 @@ def is_empty(x, cond=None, **ignored): Args: x (Variable): The Variable to be tested. - cond (Variable|None): Output parameter. Returns the test result + cond (Variable|None): Output parameter. Returns the test result of given 'x'. Default: None Returns: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3ef4afa691b1dfba07fb132753f380727bb4f3ae..9fae96d9bc7f5e6e89f8ee3894d28ae306e10ca9 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -15,12 +15,13 @@ All layers just related to the detection neural network. 
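The `ParallelDo` hunk earlier in this file attaches a runtime warning to the constructor so existing users are nudged toward `ParallelExecutor`. The same pattern in isolation, with an illustrative class name (not a PaddlePaddle API):

```python
import warnings


class LegacyParallelLayer(object):
    """Illustrative stand-in for a class being deprecated."""

    def __init__(self):
        # Category `Warning` is printed under the default filters, which is
        # why the patch uses it instead of the usually-silenced
        # DeprecationWarning.
        warnings.warn(
            "API LegacyParallelLayer is deprecated since 0.15.0. "
            "Please use the replacement API instead.", Warning)


LegacyParallelLayer()  # emits the warning at construction time
```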
""" -from layer_function_generator import generate_layer_fn -from layer_function_generator import autodoc, templatedoc +from .layer_function_generator import generate_layer_fn +from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper -import tensor -import nn +from . import tensor +from . import nn import math +from functools import reduce __all__ = [ 'prior_box', @@ -37,6 +38,7 @@ __all__ = [ __auto__ = [ 'iou_similarity', 'box_coder', + 'polygon_box_transform', ] __all__ += __auto__ @@ -1031,7 +1033,7 @@ def multi_box_head(inputs, min_sizes = [] max_sizes = [] step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2))) - for ratio in xrange(min_ratio, max_ratio + 1, step): + for ratio in range(min_ratio, max_ratio + 1, step): min_sizes.append(base_size * ratio / 100.) max_sizes.append(base_size * (ratio + step) / 100.) min_sizes = [base_size * .10] + min_sizes diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py index 384d302a709eeec220864b9e8c9210ed028470f6..bb1fb7fd571a56acf367e663af0cf9431211bcea 100644 --- a/python/paddle/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -15,7 +15,7 @@ All util layers. """ -from layer_function_generator import autodoc +from .layer_function_generator import autodoc from ..framework import unique_name from ..layer_helper import LayerHelper from ..annotations import deprecated diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index df6becabd166599df9f9963f704e372262104b2d..327ae309816344a0bcebfe70ffb59a00eab1d86f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -16,8 +16,8 @@ import multiprocessing import threading from ..data_feeder import DataFeeder -from control_flow import BlockGuard -from layer_function_generator import templatedoc +from .control_flow import BlockGuard +from .layer_function_generator import templatedoc from .. import core from ..executor import global_scope from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ @@ -69,7 +69,7 @@ def data(name, """ helper = LayerHelper('data', **locals()) shape = list(shape) - for i in xrange(len(shape)): + for i in range(len(shape)): if shape[i] is None: shape[i] = -1 append_batch_size = False @@ -387,9 +387,9 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): Create a uniform random data generator This layer returns a Reader Variable. - Instead of opening a file and reading data from it, this - Reader Variable generates float uniform random data by itself. - It can be used as a dummy reader to test a network without + Instead of opening a file and reading data from it, this + Reader Variable generates float uniform random data by itself. + It can be used as a dummy reader to test a network without opening a real file. Args: @@ -443,9 +443,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) @@ -710,9 +707,9 @@ def open_files(filenames, """ Open files - This layer takes a list of files to read from and returns a Reader Variable. - Via the Reader Variable, we can get data from given files. All files must - have name suffixs to indicate their formats, e.g., '*.recordio'. + This layer takes a list of files to read from and returns a Reader Variable. 
+ Via the Reader Variable, we can get data from given files. All files must + have name suffixs to indicate their formats, e.g., '*.recordio'. Args: filenames(list): The list of file names. @@ -828,9 +825,9 @@ def shuffle(reader, buffer_size): def batch(reader, batch_size): """ - This layer is a reader decorator. It takes a reader and adds - 'batching' decoration on it. When reading with the result - decorated reader, output data will be automatically organized + This layer is a reader decorator. It takes a reader and adds + 'batching' decoration on it. When reading with the result + decorated reader, output data will be automatically organized to the form of batches. Args: @@ -855,11 +852,11 @@ def batch(reader, batch_size): # If we read data with the raw_reader: # data = fluid.layers.read_file(raw_reader) # We can only get data instance by instance. - # + # # However, if we read data with the batch_reader: # data = fluid.layers.read_file(batch_reader) - # Each 5 adjacent instances will be automatically combined together - # to become a batch. So what we get('data') is a batch data instead + # Each 5 adjacent instances will be automatically combined together + # to become a batch. So what we get('data') is a batch data instead # of an instance. """ return __create_unshared_decorated_reader__( @@ -906,8 +903,8 @@ def read_file(reader): """ Execute the given reader and get data via it. - A reader is also a Variable. It can be a raw reader generated by - `fluid.layers.open_files()` or a decorated one generated by + A reader is also a Variable. It can be a raw reader generated by + `fluid.layers.open_files()` or a decorated one generated by `fluid.layers.double_buffer()` and so on. Args: @@ -1008,7 +1005,7 @@ class Preprocessor(object): source_lod_levels = self.underlying_reader.desc.lod_levels() self.source_var_names = [ unique_name("preprocessor_source") - for _ in xrange(len(source_shapes)) + for _ in range(len(source_shapes)) ] source_vars = [] for var_name, shape, dtype, lod_level in zip( diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 3096389101a5e5b302c78145b8bc9f1d71f6b8cb..c0d72620b1ddb183f43ebce766688518b5a737ac 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
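The `batch` docstring tidied above describes reader-decorator semantics: every `batch_size` adjacent instances are combined into one batch. A plain-Python sketch of that documented behaviour (the fluid layer itself creates a graph op rather than a Python generator; this only illustrates the semantics):

```python
def batch(reader, batch_size):
    # Reader decorator: group every `batch_size` adjacent instances
    # produced by `reader` into one batch.
    def batched_reader():
        buf = []
        for instance in reader():
            buf.append(instance)
            if len(buf) == batch_size:
                yield buf
                buf = []
        if buf:
            yield buf  # trailing, smaller batch

    return batched_reader


raw_reader = lambda: iter(range(12))
for data in batch(raw_reader, 5)():
    print(data)  # [0..4], then [5..9], then [10, 11]
```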
import re -import cStringIO import functools import warnings import string +from six.moves import cStringIO from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper @@ -70,7 +70,7 @@ def _generate_doc_string_(op_proto): if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") - buf = cStringIO.StringIO() + buf = cStringIO() buf.write(escape_math(op_proto.comment)) buf.write('\nArgs:\n') for each_input in op_proto.inputs: @@ -119,9 +119,9 @@ def generate_layer_fn(op_type): """ op_proto = OpProtoHolder.instance().get_op_proto(op_type) not_intermediate_outputs = \ - filter(lambda output: not output.intermediate, op_proto.outputs) + [output for output in op_proto.outputs if not output.intermediate] intermediate_outputs = \ - filter(lambda output: output.intermediate, op_proto.outputs) + [output for output in op_proto.outputs if output.intermediate] if len(not_intermediate_outputs) != 1: raise ValueError("Only one non intermediate output operator can be", diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index c7966e36f15ef0e3f30f8a96ad71df04aece0fa1..daf91a40f7ad7935d355a287819ad1dbcdd84eb8 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -20,10 +20,10 @@ User can also implement their own learning_rate_decay strategy according to this module. """ -import control_flow -import nn -import ops -import tensor +from . import control_flow +from . import nn +from . import ops +from . import tensor from ..initializer import init_on_cpu from ..framework import default_main_program, Parameter diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index f814c41633fbac76eb9411e2f418f521e8e9679d..0e10a91d25877984396f9bcf9aae6438707eeab1 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -13,7 +13,7 @@ # limitations under the License. from ..framework import Variable, unique_name -from layer_function_generator import OpProtoHolder +from .layer_function_generator import OpProtoHolder from ..initializer import force_init_on_cpu diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index e7d7a9e826de95514b6f2e04e7408075ab0b8cb6..49bae1e8af768d93294120e1d13ef0242313aa3c 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -20,7 +20,7 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -import nn +from . import nn __all__ = ['accuracy', 'auc'] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5d7f1eadd93a82dc2bdb88c5f5c80e437df4e29f..0960b54123d36b0ddcced6841384399cab816f48 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -33,11 +33,12 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -from layer_function_generator import autodoc, templatedoc -from tensor import concat -import utils +from .layer_function_generator import autodoc, templatedoc +from .tensor import concat +from . import utils import random from .. 
import unique_name
+from functools import reduce __all__ = [ 'fc',
@@ -949,6 +950,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): helper = LayerHelper('dropout', **locals()) out = helper.create_tmp_variable(dtype=x.dtype) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+
+ if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+ seed = helper.main_program.random_seed
+ helper.append_op( type='dropout', inputs={'X': [x]},
@@ -1313,13 +1318,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): """
- The input of the softmax layer is a 2-D tensor with shape N x K (N is the
- batch_size, K is the dimension of input feature). The output tensor has the
- same shape as the input tensor.
+ The input of the softmax operator is a tensor of any rank. The output tensor
+ has the same shape as the input.
- For each row of the input tensor, the softmax operator squashes the
- K-dimensional vector of arbitrary real values to a K-dimensional vector of real
- values in the range [0, 1] that add up to 1.
+ The input tensor will first be logically flattened to a 2-D matrix. The matrix's
+ second dimension (row length) is the same as the last dimension of the input
+ tensor, and the first dimension (column length) is the product of all other
+ dimensions of the input tensor. For each row of the matrix, the softmax operator
+ squashes the K-dimensional (K is the width of the matrix, which is also the size
+ of the input tensor's last dimension) vector of arbitrary real values to a
+ K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input.
@@ -1327,7 +1335,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): exponential values of all the other dimensions is the output of the softmax operator.
- For each row :math:`i` and each column :math:`j` in Input(X), we have:
+ For each row :math:`i` and each column :math:`j` in the matrix, we have: .. math::
@@ -4473,15 +4481,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape", **locals())
- reshaped = helper.create_tmp_variable(dtype=x.dtype)
+ out = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", inputs=inputs,
- attrs={"shape": shape,
- "inplace": inplace},
- outputs={"Out": reshaped})
+ attrs={"shape": shape},
+ outputs={"Out": out})
- return helper.append_activation(reshaped)
+ return helper.append_activation(out) def lod_reset(x, y=None, target_lod=None):
@@ -4843,7 +4850,7 @@ def dice_loss(input, label, epsilon=0.00001): loss = fluid.layers.dice_loss(input=predictions, label=label, 2) """ label = one_hot(label, depth=input.shape[-1])
- reduce_dim = range(1, len(input.shape))
+ reduce_dim = list(range(1, len(input.shape))) inse = reduce_sum(input * label, dim=reduce_dim) dice_denominator = reduce_sum( input, dim=reduce_dim) + reduce_sum(
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9e97ec9a6f55680a2eb44ad712ac002df4fecda5..f70c7f2258ce588444cf46d6c8affc4c9555203e 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from layer_function_generator import generate_layer_fn +from .layer_function_generator import generate_layer_fn __activations__ = [ 'sigmoid', @@ -66,9 +66,7 @@ __all__ = [ 'scatter', 'sum', 'slice', - 'polygon_box_transform', 'shape', - 'iou_similarity', 'maxout', ] + __activations__ diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b6614ecf3bc16e73683f4991779769049c6800ed..b93d721c12cb6ead044dc790f2f2af8a61a63b60 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -18,7 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc -from layer_function_generator import templatedoc +from .layer_function_generator import templatedoc import numpy __all__ = [ diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index b2b3186c1e8dd84e1527ff18744bd611f1f74c5f..53c33616f55be5f5ef7068a6e94418e17d739e3c 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from . import core import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] @@ -24,7 +24,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): Create a lod tensor by doing the following: - 1. Check that the length-based level of detail (LoD) also known as + 1. Check that the length-based level of detail (LoD) also known as recursive_sequence_lengths of the input is valid. 2. Convert recursive_sequence_lengths to a offset-based LoD. @@ -33,7 +33,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): CPU or GPU device (based on input place). 4. Set the level of detail (LoD) using the offset-based LoD. - + Examples: Suppose we want LoDTensor to hold data for sequences of word, where each @@ -51,7 +51,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): Args: data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a list holding the data to be copied. - recursive_seq_lens(list): a list of lists indicating the length-based level of detail + recursive_seq_lens(list): a list of lists indicating the length-based level of detail info specified by the user. place(Place): CPU or GPU place indicating where the data in the new LoDTensor will be stored. @@ -62,10 +62,10 @@ def create_lod_tensor(data, recursive_seq_lens, place): if isinstance(data, core.LoDTensor): return create_lod_tensor(np.array(data), recursive_seq_lens, place) elif isinstance(data, list): - # When input data is a list, it only deal with the case where the base element - # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated - # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number - # of words or other indexes in the sequence. + # When input data is a list, it only deal with the case where the base element + # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated + # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number + # of words or other indexes in the sequence. 
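The comments above explain that `create_lod_tensor`, given a list of index sequences, flattens them into an `[n, 1]` int64 tensor plus length-based LoD info. A numpy-only sketch of that conversion (not the fluid implementation itself):

```python
import numpy as np


def flatten_index_sequences(data):
    # `data` is a list of word-id sequences, e.g. [[312, 7], [48, 5, 9]].
    # Mirror what create_lod_tensor does for list input: flatten to an
    # [n, 1] int64 array and record the length-based LoD alongside it.
    recursive_seq_lens = [[len(seq) for seq in data]]
    flat = np.array(
        [idx for seq in data for idx in seq], dtype="int64").reshape(-1, 1)
    return flat, recursive_seq_lens


flat, lens = flatten_index_sequences([[312, 7], [48, 5, 9]])
print(flat.shape, lens)  # (5, 1) [[2, 3]]
```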
new_recursive_seq_lens = [] for seq in data: new_recursive_seq_lens.append(len(seq)) @@ -109,12 +109,12 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to represent two sentences, one of 2 words, and one of 3 words. Then - 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. - Then the overall shape of the LoDTensor would be [5, 1], holding 5 words + 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. + Then the overall shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. Args: - recursive_seq_lens(list): a list of lists indicating the length-based + recursive_seq_lens(list): a list of lists indicating the length-based level of detail info specified by the user. base_shape(list): the shape of the basic element to be held by the LoDTensor. @@ -124,11 +124,11 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, high(int): the upper bound of the random integers. Returns: - A fluid LoDTensor object with tensor data and recursive_seq_lens info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ assert isinstance(base_shape, list), "base_shape should be a list" # append the total number of basic elements to the front of its shape overall_shape = [sum(recursive_seq_lens[-1])] + base_shape - # the range of integer data elements is [low, high] + # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") return create_lod_tensor(data, recursive_seq_lens, place) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index b37b09ac81687882443c948569d9c4fca9310f78..cd8934522755691217a99a2cca271badda55368e 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -79,10 +79,10 @@ class MetricBase(object): """ states = { attr: value - for attr, value in self.__dict__.iteritems() + for attr, value in list(self.__dict__.items()) if not attr.startswith("_") } - for attr, value in states.iteritems(): + for attr, value in list(states.items()): if isinstance(value, int): setattr(self, attr, 0) elif isinstance(value, float): @@ -105,7 +105,7 @@ class MetricBase(object): """ states = { attr: value - for attr, value in self.__dict__.iteritems() + for attr, value in list(self.__dict__.items()) if not attr.startswith("_") } config = {} diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index 73946a0721dc4a6d03074a4708cf574951412e66..623a7d3fd05567a26bb6923550f597a0e1e27e32 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) try: - from graphviz import Digraph + from .graphviz import Digraph except ImportError: logger.info( 'Cannot import graphviz, which is required for drawing a network. 
This ' @@ -77,7 +77,7 @@ def parse_graph(program, graph, var_dict, **kwargs): # fill the known variables for block in program.blocks: for var in block.vars: - if not var_dict.has_key(var): + if var not in var_dict: var_dict[var] = "Feed" temp_id = 0 @@ -93,17 +93,17 @@ def parse_graph(program, graph, var_dict, **kwargs): var_dict[arg] = op.type for e in op.inputs: for arg in e.arguments: - if var_dict.has_key(arg): + if arg in var_dict: graph.edge(**draw_edge(var_dict, op, e, arg)) break # only plot the first block def draw_graph(startup_program, main_program, **kwargs): - if kwargs.has_key("graph_attr"): + if "graph_attr" in kwargs: GRAPH_STYLE.update(kwargs[graph_attr]) - if kwargs.has_key("node_attr"): + if "node_attr" in kwargs: OP_STYLE.update(kwargs[node_attr]) - if kwargs.has_key("edge_attr"): + if "edge_attr" in kwargs: VAR_STYLE.update(kwargs[edge_attr]) graph_id = unique_id() diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa..08480671d8a5c50bbec97930c451cbcdc241e1fe 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import layers +from . import layers __all__ = [ "simple_img_conv_pool", @@ -210,7 +210,7 @@ def img_conv_group(input, conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) - for i in xrange(len(conv_num_filter)): + for i in range(len(conv_num_filter)): local_conv_act = conv_act if conv_with_batchnorm[i]: local_conv_act = None @@ -488,10 +488,11 @@ def scaled_dot_product_attention(queries, trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) return layers.reshape( x=trans_x, - shape=map(int, [ - trans_x.shape[0], trans_x.shape[1], - trans_x.shape[2] * trans_x.shape[3] - ])) + shape=list( + map(int, [ + trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] * + trans_x.shape[3] + ]))) q, k, v = __compute_qkv(queries, keys, values, num_heads) diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py index 0b76e94157e378b40baff641c466968e239d8a83..93f021a360ac61f64e769d057df188d79f6f2bb6 100644 --- a/python/paddle/fluid/op.py +++ b/python/paddle/fluid/op.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
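The `net_drawer` hunk above replaces `dict.has_key()` with the `in` operator, and the `op.py` hunk that follows swaps the `str`/`unicode` check for `six.string_types`; both idioms behave identically on Python 2 and 3:

```python
import six

kwargs = {"graph_attr": {"rankdir": "TB"}}

# dict.has_key() is gone in Python 3; `in` works on both interpreters.
if "graph_attr" in kwargs:
    print(kwargs["graph_attr"])

# `unicode` is gone in Python 3; six.string_types is (str, unicode) on
# Python 2 and (str,) on Python 3.
def is_str(s):
    return isinstance(s, six.string_types)

print(is_str(u"fc"), is_str("conv2d"), is_str(3))  # True True False
```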
+import six + import paddle.fluid.core as core import paddle.fluid.proto.framework_pb2 as framework_pb2 @@ -24,13 +26,13 @@ def get_all_op_protos(): protostrs = core.get_all_op_protos() ret_values = [] for pbstr in protostrs: - op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr)) ret_values.append(op_proto) return ret_values def is_str(s): - return isinstance(s, str) or isinstance(s, unicode) + return isinstance(s, six.string_types) class OpDescCreationMethod(object): @@ -189,7 +191,7 @@ class OperatorFactory(object): return self.get_op_info(t).method(**kwargs) def types(self): - return self.op_methods.keys() + return list(self.op_methods.keys()) def get_op_info(self, t): if t not in self.op_methods: @@ -197,13 +199,13 @@ class OperatorFactory(object): return self.op_methods.get(t) def get_op_input_names(self, type): - return map(lambda x: x[0], self.get_op_info(type).inputs) + return [x[0] for x in self.get_op_info(type).inputs] def get_op_inputs(self, type): return self.get_op_info(type).inputs def get_op_output_names(self, type): - return map(lambda x: x[0], self.get_op_info(type).outputs) + return [x[0] for x in self.get_op_info(type).outputs] def get_op_outputs(self, type): return self.get_op_info(type).outputs diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 3fe99f55011ab7f745c3ad98ec44dfe277a13e05..a07325f46a2892222c2d1dcd74aa7cb01f6760a1 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -14,15 +14,15 @@ import re from collections import defaultdict from paddle.fluid.framework import Program, Variable -import framework -import layers -from backward import append_backward -from framework import program_guard -import unique_name -from initializer import Constant -from layer_helper import LayerHelper -from regularizer import append_regularization_ops -from clip import append_gradient_clip_ops, error_clip_callback +from . import framework +from . import layers +from .backward import append_backward +from .framework import program_guard +from . import unique_name +from .initializer import Constant +from .layer_helper import LayerHelper +from .regularizer import append_regularization_ops +from .clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager __all__ = [ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 10028a8c6e33edcea27650d925ca7378b770f143..eabe6bb901ad1b7afe71717eb3f5f5765f261201 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from __future__ import print_function import multiprocessing -import framework -import executor +from . import core +from . import framework +from . 
import executor import warnings import sys import os @@ -94,7 +95,7 @@ class ParallelExecutor(object): self._places = [] self._act_places = [] if use_cuda: - for i in xrange(core.get_cuda_device_count()): + for i in range(core.get_cuda_device_count()): p = core.Place() self._act_places.append(core.CUDAPlace(i)) p.set_place(self._act_places[-1]) @@ -102,7 +103,7 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in xrange(cpu_num): + for i in range(cpu_num): p = core.Place() self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) @@ -121,7 +122,7 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num + exec_strategy.num_threads = cpu_num * 2 if build_strategy is None: build_strategy = BuildStrategy() @@ -143,16 +144,16 @@ class ParallelExecutor(object): ) if share_vars_from else [] self.persistable_vars = [ - v.name - for v in filter( - lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW, - main.list_vars()) + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] ] self.executor = core.ParallelExecutor( self._places, set([ - p.name for p in main.global_block()._iter_parameters() + p.name for p in main.global_block().iter_parameters() if not p.stop_gradient ]), set(self.persistable_vars), main.desc, loss_name @@ -227,7 +228,9 @@ class ParallelExecutor(object): """ if feed is None and feed_dict is not None: feed = feed_dict - print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`" + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) if isinstance(feed, dict): feed_tensor_dict = dict() diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 4a61f85ec4b5c5108ded31632af75dbbdaaaba71..afae577656c8970338f3b02208fcb4c738628ab6 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from initializer import Initializer, Xavier, Constant -from regularizer import WeightDecayRegularizer +import six + +from .initializer import Initializer, Xavier, Constant +from .regularizer import WeightDecayRegularizer __all__ = [ 'ParamAttr', @@ -134,7 +136,7 @@ class ParamAttr(object): return [ParamAttr._to_attr(a) for a in arg] elif isinstance(arg, ParamAttr): return arg - elif isinstance(arg, str) or isinstance(arg, unicode): + elif isinstance(arg, six.string_types): return ParamAttr(name=arg) elif isinstance(arg, Initializer): return ParamAttr(initializer=arg) diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 6a321ae024dcb50452bc4d96d7e7e70f590a42c6..01983a830351b018770e6358f604781ffaae5800 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import core +from . import core from contextlib import contextmanager import os @@ -218,20 +218,20 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. 
By defalut, it records the CPU and GPU operator kernels, + and GPU program. By default, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial to add more records in C++ code. If the state == 'All', a profile proto file will be written to `profile_path`. This file records timeline information during the execution. - Then users can visualize this file to see the timeline, please refer + Then users can visualize this file to see the timeline, please refer https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md Args: state (string) : The profiling state, which should be 'CPU' or 'GPU', telling the profiler to use CPU timer or GPU timer for profiling. Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler + (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler would not inherit this place. sorted_key (string) : If None, the profiling results will be printed in the order of first end time of events. Otherwise, the profiling diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index bd57772713057f12b876942de58ee43527e94834..93b38ad3fa37bd4bff04c529cd5518a8138e55ea 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -13,8 +13,8 @@ # limitations under the License. import os -import core import contextlib +from . import core __all__ = [ 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' ] diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 080c185420bdc79d6da1d5a52fdd11fa4105d59a..6eaac4432d4df1288f37607a01484434542f1138 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import framework +from . import framework from . 
import core __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] @@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) param = decay @@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index ad28c9eff560507e5b326451159be3949353f58f..36a1a223cfd7c69aff3e8648da990d23e4e75202 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -63,7 +63,7 @@ def train(use_cuda, train_program, params_dirname): if event.step == 10: test_metrics = trainer.test( reader=test_reader, feed_order=['x', 'y']) - print test_metrics + print(test_metrics) ''' ... ['25.768919467926025'] diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py index 7fed6d914f75b690e34411aa154359c93b6ca989..9e4c384d92943227c2d68da829e6019e649a35fb 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py @@ -28,11 +28,12 @@ images per class. 
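Returning to the regularizer hunks above: with a SELECTED_ROWS gradient, the new `extract_rows` op first pulls out the row indices, and `lookup_table` then gathers only those parameter rows to compute the decay. A numpy analogy of that gather (illustrative only; the real code appends fluid ops to a block, as shown in the diff):

```python
import numpy as np

# Dense parameter and a sparse (SELECTED_ROWS-style) gradient that only
# touches rows 0 and 3 of the parameter.
param = np.arange(12, dtype="float32").reshape(4, 3)
grad_rows = np.array([0, 3], dtype="int64")  # what extract_rows produces

# lookup_table then gathers just those rows, so the decay term is only
# computed for the rows the sparse gradient actually updated.
decay = param[grad_rows]
print(decay)  # rows 0 and 3 of param
```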
""" -import cPickle import itertools import numpy import paddle.v2.dataset.common import tarfile +from six.moves import cPickle as pickle +from six.moves import zip __all__ = ['train10'] @@ -46,7 +47,7 @@ def reader_creator(filename, sub_name, batch_size=None): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) assert labels is not None - for sample, label in itertools.izip(data, labels): + for sample, label in zip(data, labels): yield (sample / 255.0).astype(numpy.float32), int(label) def reader(): @@ -56,7 +57,7 @@ def reader_creator(filename, sub_name, batch_size=None): batch_count = 0 for name in names: - batch = cPickle.load(f.extractfile(name)) + batch = pickle.load(f.extractfile(name)) for item in read_batch(batch): if isinstance(batch_size, int) and batch_count > batch_size: break diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 8e222d26907e8fe697b596a67e62cc9df84afe0e..a1f62db093904b617f0e37dc20d586ccea7eacd2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import paddle import paddle.fluid as fluid import numpy diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index dbc7bc06c93157f271c79e85b6925468e861e57f..8429551765740e7db0eda82ce0b17cff129359b0 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import paddle import paddle.fluid as fluid import numpy diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 67aa21e8c5699f1cb568dad23cd13f4cb51a6ec9..e3602e2d5643c233b2575d1adb7f181127f60287 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import paddle import paddle.fluid as fluid import numpy as np @@ -178,14 +176,15 @@ def train(use_cuda, train_program, params_dirname): if float(avg_cost) < 100.0: # Large value to increase CI speed trainer.save_params(params_dirname) else: - print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, - float(avg_cost))) + print( + ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost)))) if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -207,14 +206,14 @@ def infer(use_cuda, inference_program, params_dirname): inference_program, param_path=params_dirname, place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. 
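The comment block above recurs throughout these inference tests. A short usage sketch of the setup it describes, assuming a local PaddlePaddle installation and the `create_random_int_lodtensor` signature shown in the `lod_tensor.py` hunk earlier:

```python
import paddle.fluid as fluid

# Three sentences of lengths 3, 4 and 2; each word is a [1]-shaped index,
# so the overall tensor holds 3 + 4 + 2 = 9 words and has shape [9, 1].
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
place = fluid.CPUPlace()
tensor_words = fluid.create_random_int_lodtensor(
    recursive_seq_lens, base_shape, place, low=0, high=100)
print(tensor_words.recursive_sequence_lengths())  # [[3, 4, 2]]
```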
recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index 8becd2404b0201c44b587a28e88995958082cd28..6fb0c85a8be2b4560ea1fdb32f01146a9206ee78 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -250,7 +250,7 @@ def decode_main(use_cuda, is_sparse): feeder = fluid.DataFeeder(feed_list, place) for data in train_data(): - feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict = feeder.feed([[x[0]] for x in data]) feed_dict['init_ids'] = init_ids feed_dict['init_scores'] = init_scores @@ -259,7 +259,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.recursive_sequence_lengths() + print(result_ids.recursive_sequence_lengths()) break diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index fd278f45f1c1b71a1653c3b28ace8bca8e4b1545..898807db6f343cbefcc877e0f03ed6c5b82dd669 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function + import argparse import paddle.fluid as fluid import paddle.fluid.core as core @@ -89,8 +89,10 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): - print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(numpy.array, event.metrics))) + print( + ("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, + list(map(numpy.array, event.metrics))))) train_reader = paddle.batch( paddle.reader.shuffle( diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index b2b544e791d7ea35ff7d2c9a2dce7ce7f5680f38..6dd64be315159f1835244fa027e578434e6cb038 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
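Several test files above drop `from __future__ import print_function`, while `parallel_executor.py` gains it; the difference is whether `print` is ever used with keyword arguments. A sketch of the distinction:

```python
from __future__ import print_function  # needed only for keyword forms
import sys

# print("x") happens to parse on Python 2 even without the future import
# (it prints a parenthesized expression), so simple test scripts can
# drop the import without changing behaviour:
print("Step 0, Epoch 1")

# Keyword arguments such as file= require the real print function:
print("`feed_dict` is deprecated. Please use `feed=`", file=sys.stderr)
```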
-from __future__ import print_function + import argparse import paddle.fluid as fluid import paddle diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index c860f1641708d947fd2a8008d3d3ccd0a231f6c2..60f3d8e105209938360487d963b0328d95e7b1f0 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -186,8 +186,9 @@ def train(use_cuda, train_program, params_dirname): trainer.save_params(params_dirname) trainer.stop() else: - print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, - float(avg_cost))) + print( + ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost)))) if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 1668ae83d3581125b799508c8c3115a038e93d5a..24e65d1bd54cff7ad64453a3a61f50351d32ef08 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import paddle import paddle.fluid as fluid from functools import partial @@ -98,7 +96,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -125,14 +123,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. 
recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index 8da89d82cb8e00853eebfd794602a0e1e1020e7c..b3b1505a0fad07144f3f53c22abd5553054d8c51 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import paddle import paddle.fluid as fluid from functools import partial @@ -113,7 +111,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -140,14 +138,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 74faa2e8aa734cd644dfcc38127fd12df1fb1092..25f99ff0fd2d1050bb62338a6bf87aa29f913fb6 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
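The recursive_sequence_lengths comments above describe how [[3, 4, 2]] turns nine words into three sentences. As a concrete illustration, here is a sketch relying only on the `fluid.create_lod_tensor(data, recursive_sequence_lengths, place)` API that the patch itself references; the random word ids, and passing a numpy array as `data`, are assumptions made for illustration:

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
# 3 + 4 + 2 = 9 words in total; each word has base_shape [1], i.e. one index.
word_ids = np.random.randint(0, 100, size=(9, 1)).astype("int64")
t = fluid.create_lod_tensor(word_ids, [[3, 4, 2]], place)
print(t.recursive_sequence_lengths())  # [[3, 4, 2]]: sentences of length 3, 4 and 2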
-from __future__ import print_function - import paddle import paddle.fluid as fluid from functools import partial @@ -107,7 +105,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") elif isinstance(event, fluid.EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) + event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI trainer.save_params(params_dirname) trainer.stop() @@ -135,14 +133,14 @@ def infer(use_cuda, inference_program, params_dirname=None): place=place) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 95002aa7f9bb639828b47eb1e86e4ef954fb85ff..ce6342c2dad0b33e57d0ea90fc6ef1660ae4e68b 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function + from paddle.fluid.layers.device import get_places import unittest import paddle.fluid as fluid @@ -175,7 +175,7 @@ def train(word_dict, def train_loop(main_program): exe.run(fluid.default_startup_program()) - for pass_id in xrange(PASS_NUM): + for pass_id in range(PASS_NUM): for data in train_data(): cost_val, acc_val = exe.run(main_program, feed=feeder.feed(data), @@ -235,14 +235,14 @@ def infer(word_dict, use_cuda, save_dirname=None): word_dict_len = len(word_dict) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). 
Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 71bf5f8b3a9b17f24ce35220a9348bb871852623..37b64fa94a9aad7042e153e414ed29de3142db5a 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -114,7 +114,7 @@ def infer(use_cuda, save_dirname=None): test_reader = paddle.batch( paddle.dataset.uci_housing.test(), batch_size=batch_size) - test_data = test_reader().next() + test_data = next(test_reader()) test_feat = numpy.array( [data[0] for data in test_data]).astype("float32") test_label = numpy.array( diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index a2fb186b86c9706ac1aff0de49defbfb06e2eb0f..de6fe5f140a86545e3291db165af824739a814ef 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import paddle import paddle.fluid as fluid import contextlib @@ -121,7 +119,7 @@ def train(net_type, use_cuda, save_dirname, is_local): avg_cost = fluid.layers.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) - # Test program + # Test program test_program = fluid.default_main_program().clone(for_test=True) optimizer = fluid.optimizer.Adam(learning_rate=0.001) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index d489feae9c568ec1d9e3a230766d10d1ced0200a..b7ac911cafdc751f38c7f66bc48263a17a84dc08 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -181,7 +181,7 @@ def train(use_cuda, save_dirname=None, is_local=True): start_time = time.time() batch_id = 0 - for pass_id in xrange(PASS_NUM): + for pass_id in range(PASS_NUM): for data in train_data(): cost = exe.run(main_program, feed=feeder.feed(data), @@ -248,14 +248,14 @@ def infer(use_cuda, save_dirname=None): fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. 
+ # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[3, 4, 2]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 90c301a66105d8d872ee531556c5060b5d727515..462faad3e1cb7108f3bd6934017efe25fb9a4276 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -199,7 +199,7 @@ def train_main(use_cuda, is_sparse, is_local=True): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(1): + for pass_id in range(1): for data in train_data(): outs = exe.run(main_program, feed=feeder.feed(data), @@ -273,7 +273,7 @@ def decode_main(use_cuda, is_sparse): feeder = fluid.DataFeeder(feed_list, place) for data in train_data(): - feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict = feeder.feed([[x[0]] for x in data]) feed_dict['init_ids'] = init_ids feed_dict['init_scores'] = init_scores @@ -282,7 +282,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.recursive_sequence_lengths() + print(result_ids.recursive_sequence_lengths()) break diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index c471863920999a28cbede93a7965f07ee784f96d..3e5f76d12d41d016c995e5c85feda3c1847e356f 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import paddle.fluid.core as core import math diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 6548766ef5d0162b50d4dd072e8e91dd95dc5d2b..b30c8771fcf267260c4c5aa7076bedc89e3b7e8b 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -260,15 +260,15 @@ def infer(use_cuda, save_dirname=None): # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" - # Use create_lod_tensor(data, recursive_sequence_lengths, place) API - # to generate LoD Tensor where `data` is a list of sequences of index - # numbers, `recursive_sequence_lengths` is the length-based level of detail + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one - # level of detail info, indicating that `data` consists of two sequences - # of length 3 and 2, respectively. 
+ # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) assert feed_target_names[1] == "gender_id" diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 467282624154086a874b0e73736ed5b1358915ff..2e79be2bd0fc7a368df86e188b7fa616055bb3e7 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(2): + for pass_id in range(2): for data in train_data(): outs = exe.run(framework.default_main_program(), feed=feeder.feed(data), @@ -213,14 +213,14 @@ def infer(use_cuda, save_dirname=None): fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) # Setup input by creating LoDTensor to represent sequence of words. - # Here each word is the basic element of the LoDTensor and the shape of - # each word (base_shape) should be [1] since it is simply an index to + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. # Suppose the recursive_sequence_lengths info is set to [[4, 6]], - # which has only one level of detail. Then the created LoDTensor will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for two sentences of - # length 4 and 6, respectively. + # which has only one level of detail. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for two sentences of + # length 4 and 6, respectively. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[4, 6]] base_shape = [1] diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 3b957508ca1f11fea3bbc182dca7eaa938594cb6..e761e05795313da23a9d984263ac2e202939b1e7 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -85,9 +85,11 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): pd = fluid.layers.ParallelDo(places) with pd.do(): avg_cost, predict_word = __network__( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, next_word - ])) + list( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, + next_word + ]))) pd.write_output(avg_cost) avg_cost = fluid.layers.mean(pd()) @@ -167,11 +169,11 @@ def infer(use_cuda, save_dirname=None): word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) - # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word - # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, - # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] - # meaning there is only one level of detail and there is only one sequence of + # Setup inputs by creating 4 LoDTensors representing 4 words. 
Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of # one word on this level. # Note that recursive_sequence_lengths should be a list of lists. recursive_seq_lens = [[1]] diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index bec9f8594ff7c1aff8ae5ed55c9623754d9ea091..ccc62b442f62fa9fa175de031b0732febe38ee9a 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -78,7 +78,7 @@ for pass_id in range(PASS_NUM): if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. - print avg_loss_value[0] + print(avg_loss_value[0]) if math.isnan(float(avg_loss_value)): sys.exit("got NaN loss, training failed.") exit(1) diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index dfebb9a06ea4f290f128c486dcaccaeccdcef8c4..b2a59d27da9b3348b581d51a68d769bbf3b90d35 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
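To make the length-based LoD described in the `test_recommender_system.py` comments above concrete, a short sketch using only the calls those comments document (the variable names are illustrative):

import paddle.fluid as fluid

place = fluid.CPUPlace()
# Two sequences of indexes, of length 3 and 2, as in the comment above.
t = fluid.create_lod_tensor([[10, 2, 3], [2, 3]], [[3, 2]], place)
# A single id with a trivial one-sequence LoD, exactly as used for user_id.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)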
-from __future__ import print_function - import sys import paddle diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index fa696acdfa9058af14f0bd34ce1a2980db5aeafc..323ddfb6911fdd57b32344933373189370005126 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -118,7 +118,7 @@ def main(): feeder = fluid.DataFeeder(feed_list, place) batch_id = 0 - for pass_id in xrange(10): + for pass_id in range(10): for data in train_data(): outs = exe.run(fluid.default_main_program(), feed=feeder.feed(data), diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py index 8ea1b2b15cc0c0eb5bca67a9c5a6ac6c6774e7e2..3d92f50f0adeca79adefc291cdfba6a012fc2118 100644 --- a/python/paddle/fluid/tests/demo/fc_gan.py +++ b/python/paddle/fluid/tests/demo/fc_gan.py @@ -137,7 +137,7 @@ def main(): generated_img = exe.run(g_program, feed={'noise': n}, fetch_list={g_img})[0] - real_data = numpy.array(map(lambda x: x[0], data)).astype('float32') + real_data = numpy.array([x[0] for x in data]).astype('float32') real_data = real_data.reshape(num_true, 784) total_data = numpy.concatenate([real_data, generated_img]) total_label = numpy.concatenate([ @@ -150,7 +150,7 @@ def main(): feed={'img': total_data, 'label': total_label}, fetch_list={d_loss})[0] - for _ in xrange(NUM_TRAIN_TIMES_OF_DG): + for _ in range(NUM_TRAIN_TIMES_OF_DG): n = numpy.random.uniform( low=-1.0, high=1.0, size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py index b839e14889884bca8d27586aa8c1d76fba3458c1..a00325d79be2eba4d7f770b5316c5857952fe272 100644 --- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py @@ -36,7 +36,7 @@ if len(sys.argv) == 1: else: word_dict = load_vocab(sys.argv[1]) word_dict[""] = len(word_dict) -print "Dict dim = ", len(word_dict) +print("Dict dim = ", len(word_dict)) # input text data data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1) diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py index e8f6cfb4a907b2c01e9662e7e9bf2cb0fbd6cb1b..3bc0c9808e2345b610dea79abc56cfb0065ea46f 100644 --- a/python/paddle/fluid/tests/no_test_concurrency.py +++ b/python/paddle/fluid/tests/no_test_concurrency.py @@ -194,7 +194,7 @@ class TestRoutineOp(unittest.TestCase): quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) with fluid.Go(): - for i in xrange(10): + for i in range(10): fluid.channel_recv(ch1, result) Print(result) diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index 7a2502fa2f9733a7280e8e8d884b61b419719492..8bf750940d570cdad5e110168afc5f632202e869 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -155,7 +155,7 @@ def train_main(use_cuda): ] feeder = fluid.DataFeeder(feed_list, place) - for pass_id in xrange(1): + for pass_id in range(1): for batch_id, data in enumerate(train_reader()): outs = exe.run(main_program, 
feed=feeder.feed(data), @@ -204,8 +204,8 @@ def decode_main(use_cuda): ] feeder = fluid.DataFeeder(feed_list, place) - data = train_reader().next() - feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + data = next(train_reader()) + feed_dict = feeder.feed([[x[0]] for x in data]) feed_dict['init_ids'] = init_ids feed_dict['init_scores'] = init_scores @@ -214,7 +214,7 @@ def decode_main(use_cuda): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print(result_ids.lod()) class TestBeamSearchDecoder(unittest.TestCase): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 2d70c986b1b6c42ff709e9cf3b4234cf4fc26836..fd45abd0a77cb54a3ca8e60cf80a1efe9f9d2060 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index 3dc858971c584cca947cd958680dbdcf25df9e99..e8edd7fbbb31b1a6ecbf2a25a7d39e7b3f66363a 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import numpy as np import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 799c31dfe5161ff6aef47601f1b6f6e38885760b..082f64c146f65eee4be0757d07495c33764fa841 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -76,15 +76,15 @@ class TestMNISTIfElseOp(unittest.TestCase): PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") + x_data = np.array([x[0] for x in data]).astype("float32") + y_data = np.array([x[1] for x in data]).astype("int64") y_data = np.expand_dims(y_data, axis=1) outs = exe.run(prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss]) - print outs[0] + print(outs[0]) if outs[0] < 1.0: return self.assertFalse(True) @@ -131,15 +131,15 @@ class TestMNISTIfElseOp(unittest.TestCase): PASS_NUM = 100 for pass_id in range(PASS_NUM): for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") + x_data = np.array([x[0] for x in data]).astype("float32") + y_data = np.array([x[1] for x in data]).astype("int64") y_data = y_data.reshape((y_data.shape[0], 1)) outs = exe.run(prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss]) - print outs[0] + print(outs[0]) if outs[0] < 1.0: return self.assertFalse(True) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 322d76515e76c3d322ac7c4f989bbc95875cb654..a6a911721dfa31e5fb8d57645071af42adc968be 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -40,7 +40,7 @@ function(py_test_modules TARGET_NAME) 
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() endif() endfunction() @@ -49,6 +49,9 @@ list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dist_transformer) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) +list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -61,4 +64,7 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) +py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) +py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py index e891ee932f1440001eb25b222f1f4613e97dfcb1..b98a92dcbe5626c6cca93b3f5894302399793bf9 100644 --- a/python/paddle/fluid/tests/unittests/benchmark.py +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -16,6 +16,7 @@ import numpy as np import unittest import time import itertools +import six import paddle.fluid as fluid import paddle.fluid.core as core @@ -40,8 +41,8 @@ class BenchmarkSuite(OpTest): expect_t = np.array(item_cpu_out) actual = item_gpu_out actual_t = np.array(item_gpu_out) - var_name = variable if isinstance(variable, - basestring) else variable.name + var_name = variable if isinstance( + variable, six.string_types) else variable.name self.assertTrue( np.allclose( actual_t, expect_t, atol=atol), @@ -53,7 +54,7 @@ class BenchmarkSuite(OpTest): def _get_input_names(self): inputs = [] - for name, value in self.inputs.iteritems(): + for name, value in list(self.inputs.items()): if isinstance(value, list): inputs.extend([sub_name for sub_name, _ in value]) inputs.append(name) @@ -61,7 +62,7 @@ class BenchmarkSuite(OpTest): def _get_output_names(self): outputs = [] - for var_name, var in self.outputs.iteritems(): + for var_name, var in list(self.outputs.items()): if isinstance(var, list): for sub_var_name, sub_var in var: outputs.append(sub_var_name) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 72bc1729b0f63b23ad7ecb5ad703b984a4c614ac..62ca67ae4c041c0fe6c0836c79ebe6c23248e43e 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -14,6 +14,7 @@ import numpy as np import argparse +import six import time import math @@ -174,6 +175,9 @@ class SE_ResNeXt(): padding=(filter_size - 1) / 2, groups=groups, act=None, + # avoid pserver CPU init differs from GPU + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant()), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -194,10 +198,8 @@ class SE_ResNeXt(): def 
get_model(batch_size): # Input data - image = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) + image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name="int64", shape=[1], dtype='int64') # Train program model = SE_ResNeXt(layers=50) @@ -222,8 +224,10 @@ def get_model(batch_size): lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), + # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. + #learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + learning_rate=base_lr, momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) @@ -232,7 +236,7 @@ def get_model(batch_size): train_reader = paddle.batch( paddle.dataset.flowers.train(), batch_size=batch_size) test_reader = paddle.batch( - paddle.dataset.flowers.test(), batch_size=batch_size) + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) return test_program, avg_cost, train_reader, test_reader, acc_top1, out @@ -256,7 +260,6 @@ class DistSeResneXt2x2: trainers) pserver_prog = t.get_pserver_program(current_endpoint) startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) @@ -278,7 +281,7 @@ class DistSeResneXt2x2: def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) + batch_size=2) if is_dist: t = get_transpiler(trainer_id, fluid.default_main_program(), endpoints, @@ -294,24 +297,27 @@ class DistSeResneXt2x2: strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor( - True, - loss_name=avg_cost.name, - exec_strategy=strategy, - num_trainers=trainers, - trainer_id=trainer_id) + True, loss_name=avg_cost.name, exec_strategy=strategy) feed_var_list = [ - var for var in trainer_prog.global_block().vars.itervalues() + var for var in trainer_prog.global_block().vars.values() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = train_reader() - first_loss, = exe.run(fetch_list=[avg_cost.name]) + reader_generator = test_reader() + + data = next(reader_generator) + first_loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) print(first_loss) - for i in xrange(5): - loss, = exe.run(fetch_list=[avg_cost.name]) - last_loss, = exe.run(fetch_list=[avg_cost.name]) + + for i in six.moves.xrange(5): + data = next(reader_generator) + loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + + data = next(reader_generator) + last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) print(last_loss) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -0,0 +1,280 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+import os
+import sys
+import transformer_model
+import paddle.dataset.wmt16 as wmt16
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
+    # already been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch to be padded to the same length. A <pad> token is
+    # added into the original dictionary in paddle.dataset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for the <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for the <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. Add 1 to it to include the position of the
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a list of the batch input fields.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
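+                # np.triu(..., k=1) below keeps the strict upper triangle of an
+                # all-ones matrix, i.e. for every decoding position exactly the
+                # positions to its right; tiling it per attention head and
+                # scaling by -1e9 drives those logits towards -inf before the
+                # softmax, so each query only attends to itself and earlier words.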
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transformer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+def get_model():
+    avg_cost = transformer(use_feed=False)
+    optimizer = fluid.optimizer.Adam()
+    optimizer.minimize(avg_cost)
+    return avg_cost
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistTransformer2x2(object):
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op writes a file under /tmp containing
+                # the listen port once it is ready to process all the RPC calls.
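+                # os.stat raises OSError while that port file is still absent,
+                # so each failed probe falls into the except branch below and
+                # consumes one retry: at most ~20 probes, 3 seconds apart.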
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        avg_cost = get_model()
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
+
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in range(5):
+            _ = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+
+    reader = paddle.batch(
+        wmt16.train(ModelHyperParams.src_vocab_size,
+                    ModelHyperParams.trg_vocab_size),
+        batch_size=transformer_model.batch_size)
+
+    with fluid.recordio_writer.create_recordio_writer(
+            WMT16_RECORDIO_FILE) as writer:
+        for batch in reader():
+            for tensor in prepare_batch_input(
+                    batch, ModelHyperParams.src_pad_idx,
+                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                t = fluid.LoDTensor()
+                t.set(tensor, fluid.CPUPlace())
+                writer.append_tensor(t)
+            writer.complete_append_tensor()
+
+    model = DistTransformer2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+        sys.exit(1)
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 82b5e7cf0b3633eb04ab97c5300b1926b9d47cb6..b27d773f09d9a6daad5a10b65e683f4e11881de1 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -26,13 +26,15 @@
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, OpProtoHolder, Variable
 from testsuite import create_op, set_input, append_input_output, append_loss_ops
+from functools import reduce
+from six.moves import zip

 def randomize_probability(batch_size, class_num, dtype='float32'):
     prob = np.random.uniform(
         0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
     prob_sum = prob.sum(axis=1)
-    for i in xrange(len(prob)):
+    for i in range(len(prob)):
         prob[i] /= prob_sum[i]
     return prob
@@ -66,6 +68,10 @@ def get_numeric_gradient(place,
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
         tensor_to_check_dtype = np.float64
+    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
+        tensor_to_check_dtype = np.float16
+        # set delta as np.float16; it will be converted to float32 or float64
+        # automatically where needed
+        delta = np.array(delta).astype(np.float16)
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -73,20 +79,31 @@ gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)

     def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            numpy_tensor = numpy_tensor.flatten()
+            return numpy_tensor[i]
+        elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         else:
             return tensor._get_double_element(i)

     def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = e
+            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
+            tensor.set(numpy_tensor, place)
+        elif tensor_to_check_dtype == np.float32:
             tensor._set_float_element(i, e)
         else:
             tensor._set_double_element(i, e)

     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
-    for i in xrange(tensor_size):
+    for i in range(tensor_size):
         if in_place:
             set_input(scope, op, inputs, place)
@@ -133,13 +150,18 @@ class OpTest(unittest.TestCase):
         if not self.call_once:
             self.call_once = True
             self.dtype = data_type
+            # See the comment of np_dtype_to_fluid_dtype
+            # If the input type is uint16, we assume float16 is used
+            # for the lodtensor dtype.
+            if self.dtype == np.uint16:
+                self.dtype = np.float16

     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def infer_dtype(numpy_dict):
             assert isinstance(
                 numpy_dict, dict), "self.inputs, self.outputs must be numpy_dict"
-            for var_name, var_value in numpy_dict.iteritems():
+            for var_name, var_value in numpy_dict.items():
                 if isinstance(var_value, (np.ndarray, np.generic)):
                     self.try_call_once(var_value.dtype)
                 elif isinstance(var_value, (list, tuple)):
@@ -161,19 +183,25 @@
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
-                        tensor.set(np_value[0], place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value[0]), place)
                         tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
-                        tensor.set(np_value, place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value), place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
-                    tensor.set(self.inputs[var_name][0], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name][
+                            0]), place)
                     tensor.set_recursive_sequence_lengths(self.inputs[var_name][
                         1])
                 else:
-                    tensor.set(self.inputs[var_name], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name]),
+                        place)
                 feed_map[var_name] = tensor

         return feed_map
@@ -197,7 +225,7 @@
     def _get_io_vars(self, block, numpy_inputs):
         inputs = {}
-        for name, value in numpy_inputs.iteritems():
+        for name, value in numpy_inputs.items():
             if isinstance(value, list):
                 var_list = [
                     block.var(sub_name) for sub_name, sub_value in value
@@ -240,7 +268,7 @@
         # if the fetch_list is customized by user, we use it directly.
         # if not, fill the fetch_list by the user configured outputs in test.
         if len(fetch_list) == 0:
-            for var_name, var in outputs.iteritems():
+            for var_name, var in outputs.items():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
@@ -252,7 +280,7 @@
                 fetch_list.append(str(out_name))
         # fetch_list = map(block.var, fetch_list)
         if not isinstance(fetch_list[0], fluid.framework.Variable):
-            fetch_list = map(block.var, fetch_list)
+            fetch_list = list(map(block.var, fetch_list))
         outs = executor.run(program,
                             feed=feed_map,
                             fetch_list=fetch_list,
@@ -307,13 +335,22 @@
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + "\n" + str(expect_t))
+                    "\nExpect " + str(expect_t) + "\n" + "But Got " +
+                    str(actual_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.recursive_sequence_lengths(),
                                          expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))

     def _get_places(self):
+        if self.dtype == np.float16:
+            if core.is_compiled_with_cuda() and core.op_support_gpu(
+                    self.op_type):
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    return [place]
+            else:
+                return []
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
@@ -334,7 +371,7 @@
     def __assert_is_close(self, numeric_grads, analytic_grads, names,
                           max_relative_error, msg_prefix):
-        for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
+        for a, b, name in zip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
@@ -344,9 +381,9 @@
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
                 return ("%s Variable %s max gradient diff %f over limit %f, "
-                        "the first error element is %d, %f, %f") % (
-                            msg_prefix, name, max_diff, max_relative_error,
-                            offset, a.flatten()[offset], b.flatten()[offset])
+                        "the first error element is %d, expected %f, but got %f"
+                        ) % (msg_prefix, name, max_diff, max_relative_error,
+                             offset, a.flatten()[offset], b.flatten()[offset])

             self.assertLessEqual(max_diff, max_relative_error, err_msg())
@@ -435,6 +472,21 @@
             input.dtype = np.uint16
         return input

+    @staticmethod
+    def fluid_dtype_to_np_dtype(dtype):
+        """
+        See above, convert the dtype to normal type.
+ """ + if dtype == np.uint16: + dtype = np.float16 + return dtype + + @staticmethod + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def _get_gradient(self, input_to_check, place, @@ -457,9 +509,9 @@ class OpTest(unittest.TestCase): if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + use_cuda=use_cuda, loss_name=loss.name, main_program=prog) else: executor = Executor(place) - return map(np.array, - executor.run(prog, feed_dict, fetch_list, - return_numpy=False)) + return list( + map(np.array, + executor.run(prog, feed_dict, fetch_list, return_numpy=False))) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index fcf86cc5839113b75855ce97459b2ee4881238cd..67c35e9de7e83699bf30ca946856bb907152cbdd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -91,7 +91,7 @@ class TestParallelExecutorBase(unittest.TestCase): first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - for i in xrange(iter): + for i in range(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) last_loss, = run_executor( @@ -99,8 +99,8 @@ class TestParallelExecutorBase(unittest.TestCase): end = time.time() if batch_size is not None: - print "%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin)) + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) avg_last_loss_val = np.array(last_loss).mean() avg_first_loss_val = np.array(first_loss).mean() @@ -108,6 +108,6 @@ class TestParallelExecutorBase(unittest.TestCase): float(avg_first_loss_val)): sys.exit("got NaN loss, training failed.") - print first_loss, last_loss + print(first_loss, last_loss) # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 212a87e529da83c40ba8852e81bdf43d4611897b..db1861fd10e371ebe631a16380af591875886769 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -26,7 +26,7 @@ class TestAccuracyOp(OpTest): label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} num_correct = 0 - for rowid in xrange(n): + for rowid in range(n): for ele in indices[rowid]: if ele == label[rowid]: num_correct += 1 diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2..34f9cf0620fd1351111e93e16ed5f7e765d7078b 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -313,9 +313,9 @@ class TestAbs(OpTest): self.init_dtype() x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) - # Because we set delta = 0.005 in caculating numeric gradient, + # Because we set delta = 0.005 in calculating numeric gradient, # if x is too small, such as 0.002, x_neg will be -0.003 - # x_pos will be 0.007, so the numeric gradient is unaccurate. + # x_pos will be 0.007, so the numeric gradient is inaccurate. 
# we should avoid this x[np.abs(x) < 0.005] = 0.02 out = np.abs(x) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 3c65f3d44adcebdca92f78f7834d4878a9fa3dfe..fa4b39879c0ede569b6802502b2c71a93b163373 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -273,7 +273,7 @@ class TestSparseAdamOp(unittest.TestCase): self.setup(scope, place) op_args = dict() - for key, np_array in self.dense_inputs.iteritems(): + for key, np_array in self.dense_inputs.items(): var = scope.var(key).get_tensor() var.set(np_array, place) op_args[key] = key @@ -290,7 +290,7 @@ class TestSparseAdamOp(unittest.TestCase): adam_op = Operator("adam", **op_args) adam_op.run(scope, place) - for key, np_array in self.outputs.iteritems(): + for key, np_array in self.outputs.items(): out_var = scope.var(key).get_tensor() actual = np.array(out_var) actual = actual.reshape([actual.size]) diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py index a49e9035a43e04fc1d1b2328d7562c053320b24b..0000fb0958a129e9e1098de1fad888c503cfbdc5 100644 --- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py +++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py @@ -80,8 +80,9 @@ class TestArrayReadWrite(unittest.TestCase): append_backward(total_sum_scaled) - g_vars = map(default_main_program().global_block().var, - [each_x.name + "@GRAD" for each_x in x]) + g_vars = list( + map(default_main_program().global_block().var, + [each_x.name + "@GRAD" for each_x in x])) g_out = [ item.sum() for item in exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index fcb2612326e74cf6417aa93f2691154c79b5e44c..f805fdc35f624bf6e9d94d66839dcb2a0143a29b 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -415,7 +415,7 @@ class TestBatchNormOpTraining(unittest.TestCase): self.__assert_close(scale_grad, out[6], "scale_grad") self.__assert_close(bias_grad, out[7], "bias_grad") - print "op test forward passed: ", str(place), data_layout + print("op test forward passed: ", str(place), data_layout) places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index 167451edd8c46c006c8019678a304a38f18cb946..e8283fc9422d93af5735aaec1a165b46ac1ef78e 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -59,8 +59,7 @@ class BeamSearchOpTester(unittest.TestCase): np.allclose( np.array(selected_scores), np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) - self.assertEqual(selected_ids.lod(), - [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) + self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index d5bd726c4a82ee839703c69a933100bb056cb736..ceeca25b74d85ed2874d672e402e3186c4ce7d47 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -48,7 +48,7 @@ def 
bipartite_match(distance, match_indices, match_dist): def argmax_match(distance, match_indices, match_dist, threshold): r, c = distance.shape - for j in xrange(c): + for j in range(c): if match_indices[j] != -1: continue col_dist = distance[:, j] diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py index 23932194f0ca97954ec9ade3fdcaebd7a32749a0..354110f1f96f6b4aad1a4866c8d1337dec3acd16 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py @@ -63,7 +63,7 @@ class TestChunkEvalOp(OpTest): # generate chunk beginnings chunk_begins = sorted( np.random.choice( - range(starts[-1]), num_chunks, replace=False)) + list(range(starts[-1])), num_chunks, replace=False)) seq_chunk_begins = [] begin_idx = 0 # divide chunks into sequences @@ -93,7 +93,7 @@ class TestChunkEvalOp(OpTest): self.num_infer_chunks + self.num_label_chunks - self.num_correct_chunks) correct_chunks = np.random.choice( - range(len(chunks)), self.num_correct_chunks, replace=False) + list(range(len(chunks))), self.num_correct_chunks, replace=False) infer_chunks = np.random.choice( [x for x in range(len(chunks)) if x not in correct_chunks], self.num_infer_chunks - self.num_correct_chunks, @@ -138,7 +138,8 @@ class TestChunkEvalOp(OpTest): infer.fill(self.num_chunk_types * self.num_tag_types) label = np.copy(infer) starts = np.random.choice( - range(1, self.batch_size), self.num_sequences - 1, + list(range(1, self.batch_size)), + self.num_sequences - 1, replace=False).tolist() starts.extend([0, self.batch_size]) starts = sorted(starts) diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py index d9f83905e6135e22f74e749857f9b0fbe464d3f4..77869a1242e08d348bfb1031b8f5b1ab5c81d868 100644 --- a/python/paddle/fluid/tests/unittests/test_conditional_block.py +++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py @@ -39,7 +39,7 @@ class ConditionalBlockTest(unittest.TestCase): x = numpy.random.random(size=(10, 1)).astype('float32') outs = exe.run(feed={'X': x}, fetch_list=[out])[0] - print outs + print(outs) loss = layers.mean(out) append_backward(loss=loss) outs = exe.run( @@ -47,7 +47,7 @@ class ConditionalBlockTest(unittest.TestCase): fetch_list=[ default_main_program().block(0).var(data.name + "@GRAD") ])[0] - print outs + print(outs) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py index db6be21baaa54d33af9f5c44d1815e4b389eb884..d0de7ad52c8a851c16cbbbf544d479f696dee136 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py @@ -20,16 +20,19 @@ from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride class TestMKLDNN(TestConv2dOp): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" class TestMKLDNNWithPad(TestWithPad): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" class TestMKLDNNWithStride(TestWithStride): def init_kernel_type(self): self.use_mkldnn = True + self.data_format = "NCHW" if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index a478649541ba9828e55c4239090d5aee554223ac..bb1cd87d615fa341b7244e9f3e113b9fb4765ac2 
100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -66,6 +66,7 @@ class TestConv2dOp(OpTest): self.op_type = "conv2d" self.use_cudnn = False self.use_mkldnn = False + self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() self.init_group() @@ -93,7 +94,8 @@ class TestConv2dOp(OpTest): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_mkldnn + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format } self.outputs = {'Output': output} @@ -101,59 +103,35 @@ class TestConv2dOp(OpTest): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'Filter']), - 'Output', - max_relative_error=0.02) - else: - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) - else: - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) - else: - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py index 5d4d244f439a671d895f9237b793e6c6bbf2895b..9fdb7baa90d2184c3c439e76b6bb5f0668f5f9ee 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py @@ -22,8 +22,8 @@ def conv_shift_forward(x, y): M = x.shape[1] N = y.shape[1] y_half_width = (N - 1) / 2 - for i in xrange(M): - for j in xrange(N): + for i in range(M): + for j in range(N): out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j] return out diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py index 5e6f9a20a93e467980f5a4f23fbcb6118317fe44..07c89eefc32fab37ce093e91d96fbe4471ecddc6 100644 --- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py +++ 
b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py @@ -18,7 +18,7 @@ import paddle.fluid.layers as layers class TestDocString(unittest.TestCase): def test_layer_doc_string(self): - print layers.dropout.__doc__ + print(layers.dropout.__doc__) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index aa09b0ea445adccae3f741b53850f8182f3270cc..951282e8bab5018204c0d31caa10f8f84a8f3d6c 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -21,7 +21,7 @@ import numpy as np class TestDataBalance(unittest.TestCase): def prepare_data(self): def fake_data_generator(): - for n in xrange(self.total_ins_num): + for n in range(self.total_ins_num): yield np.ones((3, 4)) * n, n # Prepare data @@ -41,7 +41,7 @@ class TestDataBalance(unittest.TestCase): def prepare_lod_data(self): def fake_data_generator(): - for n in xrange(1, self.total_ins_num + 1): + for n in range(1, self.total_ins_num + 1): d1 = (np.ones((n, 3)) * n).astype('float32') d2 = (np.array(n).reshape((1, 1))).astype('int32') yield d1, d2 @@ -58,9 +58,9 @@ class TestDataBalance(unittest.TestCase): (0, 1)) ] lod = [0] - for _ in xrange(self.batch_size): + for _ in range(self.batch_size): try: - ins = generator.next() + ins = next(generator) except StopIteration: eof = True break diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py index a3bf7b544b91c70ffe3894219c118ec9887aba81..868bcca881a65dad7d0ecabb1e388818cdd0997e 100644 --- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py +++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py @@ -39,7 +39,7 @@ class TestDefaultScopeFuncs(unittest.TestCase): self.assertTrue(i.is_int()) self.assertEqual(10, i.get_int()) - for _ in xrange(10): + for _ in range(10): scoped_function(__new_scope__) diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index 05d3367ad8ec2bc3df794015a7c25e943a26c68c..8b66d1b270980a18fd1bbd068917e982a450ad6f 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -176,7 +176,7 @@ class TestDetectionMAPOp(OpTest): true_pos[label].append([score, tp]) false_pos[label].append([score, fp]) - for (label, label_pos_num) in label_count.items(): + for (label, label_pos_num) in list(label_count.items()): if label_pos_num == 0 or label not in true_pos: continue label_true_pos = true_pos[label] label_false_pos = false_pos[label] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py new file mode 100644 index 0000000000000000000000000000000000000000..58cfd4e1fd958d8d59e49c87fbbabd0182975add --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import unittest
+import os
+import sys
+import signal
+import subprocess
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self, model_file):
+        # args: role, endpoint list, trainer_id, current endpoint,
+        # trainer count, is_dist
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 50
+        while True:
+            assert retry_times >= 0, "pserver is not ready within the retry limit"
+            time.sleep(3)
+            try:
+                # listen_and_serv_op writes a file containing its listen port
+                # to the /tmp directory once it is ready to process RPC calls.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error as e:
+                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
+                                 (e, retry_times))
+                retry_times -= 1
+
+    def check_with_place(self, model_file, delta=1e-3):
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUS TO RUN
+        required_envs = {
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH"),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1"
+        }
+        # Run locally to get a baseline
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s %s trainer %s 0 %s %d FALSE" % \
+            (self._python_interp, model_file,
+             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
+        local_proc.wait()
+        out, err = local_proc.communicate()
+        local_ret = out
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver(model_file)
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env1)
+
+        tr0_proc.wait()
+        tr1_proc.wait()
+        out, err =
tr0_proc.communicate() + sys.stderr.write('dist_stderr: %s\n' % err) + loss_data0 = out + sys.stderr.write('dist_loss: %s\n' % loss_data0) + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) + self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index ad2d57f7c5f127be87e963508e1dd150fdd30225..a6fcbd977f1af9d452dfd8367efef706cd566149 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -25,6 +25,7 @@ import unittest from multiprocessing import Process import os import signal +from functools import reduce SEED = 1 DTYPE = "float32" @@ -172,12 +173,12 @@ class TestDistMnist(unittest.TestCase): exe.run(fluid.default_startup_program()) feed_var_list = [ - var for var in trainer_prog.global_block().vars.itervalues() + var for var in trainer_prog.global_block().vars.values() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in xrange(10): + for pass_id in range(10): for batch_id, data in enumerate(train_reader()): exe.run(trainer_prog, feed=feeder.feed(data)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index e3e7036f08cb88087ae45fe7d7c7565c102dab8a..f3a5fd6985bab1d04f6e1484534367548f383dfb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -11,111 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import argparse -import time -import math - import unittest -import os -import signal -import subprocess - - -class TestDistSeResneXt2x2(unittest.TestCase): - def setUp(self): - self._trainers = 2 - self._pservers = 2 - self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" - self._python_interp = "python" - - def start_pserver(self): - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return ps0_proc, ps1_proc - - def _wait_ps_ready(self, pid): - retry_times = 20 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(3) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. 
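
Aside: the port-file handshake described in the comment above survives this refactor; TestDistBase._wait_ps_ready polls the same /tmp/paddle.<pid>.port sentinel. A standalone sketch of the pattern, with an illustrative function name and retry defaults (not part of the patch):

```python
import os
import time


def wait_for_port_file(pid, retries=50, interval=3):
    """Poll for the sentinel file a pserver writes once it accepts RPCs."""
    path = "/tmp/paddle.%d.port" % pid  # path written by listen_and_serv
    for _ in range(retries):
        if os.path.exists(path):
            return
        time.sleep(interval)
    raise RuntimeError("pserver %d never became ready" % pid)
```

Polling a sentinel file keeps the harness free of any RPC dependency: trainers are launched only after both pservers have written their port files.
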
- os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def non_test_with_place(self): - # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN - required_envs = { - "PATH": os.getenv("PATH"), - "PYTHONPATH": os.getenv("PYTHONPATH"), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15" - } - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \ - (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) - local_proc = subprocess.Popen( - local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local) - local_proc.wait() - local_ret = local_proc.stdout.read() - - # Run dist train to compare with local results - ps0, ps1 = self.start_pserver() - self._wait_ps_ready(ps0.pid) - self._wait_ps_ready(ps1.pid) - - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) - FNULL = open(os.devnull, 'w') - - tr0_proc = subprocess.Popen( - tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0) - tr1_proc = subprocess.Popen( - tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1) - - tr0_proc.wait() - tr1_proc.wait() - loss_data0 = tr0_proc.stdout.read() - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] +from test_dist_base import TestDistBase - self.assertAlmostEqual(local_first_loss, dist_first_loss) - self.assertAlmostEqual(local_last_loss, dist_last_loss) - # check tr0_out - # FIXME: ensure the server process is killed - # replace with ps0.terminate() - os.kill(ps0.pid, signal.SIGKILL) - os.kill(ps1.pid, signal.SIGKILL) - FNULL.close() +class TestDistSeResneXt2x2(TestDistBase): + def test_se_resnext(self): + self.check_with_place("dist_se_resnext.py") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
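
With the harness factored into TestDistBase, a distributed test reduces to a subclass that names its model script and a tolerance, exactly as the new test_dist_transformer.py below does. A hypothetical further example (the script name dist_my_model.py is a placeholder, not part of this patch):

```python
import unittest

from test_dist_base import TestDistBase


class TestDistMyModel2x2(TestDistBase):
    def test_my_model(self):
        # "dist_my_model.py" stands in for any trainer/pserver script that
        # follows the same command-line contract as dist_se_resnext.py.
        self.check_with_place("dist_my_model.py", delta=1e-3)


if __name__ == "__main__":
    unittest.main()
```
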
+ +import unittest +from test_dist_base import TestDistBase + + +class TestDistTransformer2x2(TestDistBase): + def test_transformer(self): + # TODO(paddle-dev): check if the delta is OK. + # Usually start around ~8000 and converge to ~5000 + self.check_with_place("dist_transformer.py", delta=400) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 9dbef0693bb129186dfc50f6efdd0896deedda81..3bc5e6ada5dabfcad38278db6e14d0b115c5050d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase): self.origin_prog = main.clone() return main - def get_trainer(self, config=None): - t = self._transpiler_instance(config) + def get_trainer(self, config=None, sync_mode=True): + t = self._transpiler_instance(config, sync_mode) return t.get_trainer_program() - def get_pserver(self, ep, config=None): - t = self._transpiler_instance(config) + def get_pserver(self, ep, config=None, sync_mode=True): + t = self._transpiler_instance(config, sync_mode) pserver = t.get_pserver_program(ep) startup = t.get_startup_program(ep, pserver) return pserver, startup - def _transpiler_instance(self, config=None): + def _transpiler_instance(self, config=None, sync_mode=True): if not self.transpiler: main = self.get_main_program() self.transpiler = fluid.DistributeTranspiler(config=config) @@ -69,13 +69,23 @@ class TranspilerTest(unittest.TestCase): self.trainer_id, program=main, pservers=self.pserver_eps, - trainers=self.trainers) + trainers=self.trainers, + sync_mode=sync_mode) return self.transpiler + def transpiler_test_impl(self): + pass -class TestBasicModel(TranspilerTest): def test_transpiler(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + self.transpiler_test_impl() + + +class TestBasicModel(TranspilerTest): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) pserver2, startup2 = self.get_pserver(self.pserver2_ep) @@ -123,7 +133,7 @@ class TestBasicModel(TranspilerTest): class TestBasicModelWithLargeBlockSize(TranspilerTest): - def test_transpiler(self): + def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() config.min_block_size = 1048576 @@ -148,10 +158,10 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest): ["sum", "scale", "sgd"]) # confirm startup program self.assertEqual([op.type for op in startup.global_block().ops], - ["fill_constant", "fill_constant", "fill_constant"]) + ["fill_constant", "fill_constant"]) # the variable #fc_w will be split into two blocks fc_w_var = startup2.global_block().var("fc_w") - self.assertEqual(fc_w_var.shape, (1000L, 1000L)) + self.assertEqual(fc_w_var.shape, (1000, 1000)) # all parameters should be optimized on pserver pserver_params = [] @@ -177,16 +187,16 @@ class TestNoSliceVar(TranspilerTest): def setUp(self): super(TestNoSliceVar, self).setUp() - def test_transpiler(self): + def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() config.slice_var_up = False _, startup = self.get_pserver(self.pserver1_ep, config) _, startup2 = self.get_pserver(self.pserver2_ep, config) - if startup.global_block().vars.has_key("fc_w"): + if "fc_w" in startup.global_block().vars: fc_w_var = startup.global_block().vars["fc_w"] - elif startup2.global_block().vars.has_key("fc_w"): + 
elif "fc_w" in startup2.global_block().vars: fc_w_var = startup2.global_block().vars["fc_w"] self.assertEqual(fc_w_var.shape, (1000, 1000)) @@ -212,7 +222,7 @@ class TestLRDecay(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -242,7 +252,7 @@ class TestLRDecayConditional(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -291,7 +301,7 @@ class TestL2Decay(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -326,7 +336,7 @@ class TestL2DecayWithPiecewise(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -350,5 +360,181 @@ class TestL2DecayWithPiecewise(TranspilerTest): ["sum", "scale", "scale", "elementwise_add", "momentum"]) +class TestDistLookupTableBase(TranspilerTest): + def network_with_table(self, is_sparse, is_distributed): + def emb_pool(ids): + table_size = 1000 + emb_size = 64 + emb = fluid.layers.embedding( + input=ids, + size=[table_size, emb_size], + dtype='float32', + param_attr='shared_w', # share parameter + is_sparse=is_sparse, + is_distributed=is_distributed) + pool = fluid.layers.sequence_pool(input=emb, pool_type='average') + return pool + + title_ids = fluid.layers.data( + name='title_ids', shape=[1], dtype='int64', lod_level=1) + brand_ids = fluid.layers.data( + name='brand_ids', shape=[1], dtype='int64', lod_level=1) + title_emb = emb_pool(title_ids) + brand_emb = emb_pool(brand_ids) + fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1) + predict = fluid.layers.fc(input=fc0, + size=2, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + +class TestLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "adam", "scale", "scale"]) + + trainer = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 
+ 'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv', + 'fetch_barrier' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestAsyncLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) + + self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["adam", "scale", "scale"]) + + trainer = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv', + 'recv', 'recv', 'concat' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestAsyncDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + + pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in 
pserver1.blocks[2].ops], ["sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer = self.get_trainer(config) + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_ids', 'send', 'recv', 'recv' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 712fd5849d80b1915ae3b2ae5108bedee8d88a2c..4bb3998f891959f8270dc4b25821f23f1e6195e0 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -183,12 +183,12 @@ class TestDistMnist(unittest.TestCase): exec_strategy=exec_strategy) feed_var_list = [ - var for var in trainer_prog.global_block().vars.itervalues() + var for var in trainer_prog.global_block().vars.values() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in xrange(10): + for pass_id in range(10): for batch_id, data in enumerate(train_reader()): avg_loss_np = train_exe.run(feed=feeder.feed(data), fetch_list=[avg_cost.name]) diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 4448de8839d7ad4ad1f70ecdc4ac94da1e619adb..fdc6adc93bc2488d4faffed61fde5d54bbbbfd57 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -135,7 +135,7 @@ class TestDynRNN(unittest.TestCase): loss_0 = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])[0] - for _ in xrange(100): + for _ in range(100): val = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])[0] diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 0f289af284773caf8515f9cbdd38e0d4481e4e44..7756885166c88eadb77c2c6d56aab767015abc51 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -61,13 +61,13 @@ class BaseRNN(object): self.num_seq = num_seq self.inputs = collections.defaultdict(list) - for _ in xrange(num_seq): + for _ in range(num_seq): seq_len = random.randint(1, max_seq_len - 1) for iname in ins: ishape = ins[iname].get('shape', None) idtype = ins[iname].get('dtype', 'float32') lst = [] - for _ in xrange(seq_len): + for _ in range(seq_len): lst.append(numpy.random.random(size=ishape).astype(idtype)) self.inputs[iname].append(lst) @@ -96,16 +96,16 @@ class BaseRNN(object): for out in self.outputs: retv[out] = [] - for seq_id in xrange(self.num_seq): + for seq_id in range(self.num_seq): for mname in self.mems: self.mems[mname].reset() for out in self.outputs: 
self.outputs[out].next_sequence() - iname0 = self.inputs.keys()[0] + iname0 = list(self.inputs.keys())[0] seq_len = len(self.inputs[iname0][seq_id]) - for step_id in xrange(seq_len): + for step_id in range(seq_len): xargs = dict() for iname in self.inputs: @@ -138,7 +138,7 @@ class BaseRNN(object): for iname in self.inputs: lod = [] np_flatten = [] - for seq_id in xrange(len(self.inputs[iname])): + for seq_id in range(len(self.inputs[iname])): seq_len = len(self.inputs[iname][seq_id]) lod.append(seq_len) np_flatten.extend(self.inputs[iname][seq_id]) @@ -159,8 +159,8 @@ class BaseRNN(object): " which is not matrix") g = numpy.zeros(shape=p.shape, dtype=p.dtype) - for i in xrange(p.shape[0]): - for j in xrange(p.shape[1]): + for i in range(p.shape[0]): + for j in range(p.shape[1]): o = p[i][j] p[i][j] += delta pos = self._exe_mean_out_() @@ -184,7 +184,7 @@ class BaseRNN(object): if len(item.shape) != 1: raise ValueError("Not support") - for i in xrange(len(item)): + for i in range(len(item)): o = item[i] item[i] += delta pos = self._exe_mean_out_() @@ -198,14 +198,14 @@ class BaseRNN(object): if not return_one_tensor: return grad - for i in xrange(len(grad)): + for i in range(len(grad)): grad[i] = numpy.concatenate(grad[i]) grad = numpy.concatenate(grad) return grad def _exe_mean_out_(self): outs = self.exe() - return numpy.array([o.mean() for o in outs.itervalues()]).mean() + return numpy.array([o.mean() for o in outs.values()]).mean() class SeedFixedTestCase(unittest.TestCase): @@ -274,13 +274,14 @@ class TestSimpleMul(SeedFixedTestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) - out, w_g, i_g = map(numpy.array, - exe.run(feed=py_rnn.to_feed(cpu), - fetch_list=[ - out, self.PARAM_NAME + "@GRAD", - self.DATA_NAME + "@GRAD" - ], - return_numpy=False)) + out, w_g, i_g = list( + map(numpy.array, + exe.run(feed=py_rnn.to_feed(cpu), + fetch_list=[ + out, self.PARAM_NAME + "@GRAD", self.DATA_NAME + + "@GRAD" + ], + return_numpy=False))) out_by_python = py_rnn.exe()[self.OUT_NAME] self.assertTrue(numpy.allclose(out, out_by_python)) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) @@ -351,14 +352,15 @@ class TestSimpleMulWithMemory(SeedFixedTestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) feed = py_rnn.to_feed(cpu) - last_np, w_g, i_g = map(numpy.array, - exe.run(feed=feed, - fetch_list=[ - last, self.PARAM_NAME + "@GRAD", - self.DATA_NAME + "@GRAD" - ], - return_numpy=False)) - last_by_py, = py_rnn.exe().values() + last_np, w_g, i_g = list( + map(numpy.array, + exe.run(feed=feed, + fetch_list=[ + last, self.PARAM_NAME + "@GRAD", self.DATA_NAME + + "@GRAD" + ], + return_numpy=False))) + last_by_py, = list(py_rnn.exe().values()) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) self.assertTrue(numpy.allclose(last_np, last_by_py)) diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index 31af1245720405ee067a0acf3575e3ae86372c13..d182889a970fb178dec4976aebbd79d05dc3e91e 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -67,7 +67,7 @@ class TestDyRnnStaticInput(unittest.TestCase): def _lodtensor_to_ndarray(self, lod_tensor): dims = lod_tensor.shape() ndarray = np.zeros(shape=dims).astype('float32') - for i in xrange(np.product(dims)): + for i in range(np.product(dims)): ndarray.ravel()[i] = lod_tensor._get_float_element(i) return ndarray, 
lod_tensor.recursive_sequence_lengths() @@ -114,7 +114,7 @@ class TestDyRnnStaticInput(unittest.TestCase): shape=[1], dtype='int64', value=0) step_idx.stop_gradient = True - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): step_out = fluid.layers.array_read(static_input_out_array, step_idx) step_out.stop_gradient = True @@ -140,27 +140,27 @@ class TestDyRnnStaticInput(unittest.TestCase): static_lod = self.static_input_tensor.recursive_sequence_lengths() static_sliced = [] cur_offset = 0 - for i in xrange(len(static_lod[0])): + for i in range(len(static_lod[0])): static_sliced.append(self.static_input_data[cur_offset:( cur_offset + static_lod[0][i])]) cur_offset += static_lod[0][i] static_seq_len = static_lod[0] static_reordered = [] - for i in xrange(len(x_sorted_indices)): + for i in range(len(x_sorted_indices)): static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist()) static_seq_len_reordered = [ static_seq_len[x_sorted_indices[i]] - for i in xrange(len(x_sorted_indices)) + for i in range(len(x_sorted_indices)) ] static_step_outs = [] static_step_lods = [] - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1) lod = [] total_len = 0 - for i in xrange(end): + for i in range(end): lod.append(static_seq_len_reordered[i]) total_len += lod[-1] static_step_lods.append([lod]) @@ -174,7 +174,7 @@ class TestDyRnnStaticInput(unittest.TestCase): static_step_outs = self.build_graph(only_forward=True) self.exe.run(framework.default_startup_program()) expected_outs, expected_lods = self.get_expected_static_step_outs() - for i in xrange(self._max_sequence_len): + for i in range(self._max_sequence_len): step_out, lod = self.fetch_value(static_step_outs[i]) self.assertTrue(np.allclose(step_out, expected_outs[i])) self.assertTrue(np.allclose(lod, expected_lods[i])) @@ -189,7 +189,7 @@ class TestDyRnnStaticInput(unittest.TestCase): numeric_gradients = np.zeros(shape=static_input_shape).astype('float32') # calculate numeric gradients tensor_size = np.product(static_input_shape) - for i in xrange(tensor_size): + for i in range(tensor_size): origin = self.static_input_tensor._get_float_element(i) x_pos = origin + self._delta self.static_input_tensor._set_float_element(i, x_pos) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py index c6f45381af8ac64d117eb27325f25763fbf6cae7..6f350044892a4ba2a985b5bc2328ab1fc20c5504 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py @@ -26,7 +26,7 @@ class TestElementWiseAddOp(unittest.TestCase): def test_with_place(place): out_grad = np.random.random_sample(self.x.shape).astype(np.float32) x_grad = out_grad - sum_axis = range(0, len(self.x.shape)) + sum_axis = list(range(0, len(self.x.shape))) del sum_axis[self.axis] y_grad = np.sum(out_grad, axis=tuple(sum_axis)) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..1854232194963bcbe302010320a30d85747eea96 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" 
self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32") } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6a41c44fe655b18626bdb727745dae032babe8ad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class TestExtractRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + feature_len = 12 + rows = [0, 4, 4, 7] + np_array = np.ones((len(rows), feature_len)).astype("float32") + + in_x = scope.var('X').get_selected_rows() + in_x.set_height(len(rows)) + in_x.set_rows(rows) + in_x_tensor = in_x.get_tensor() + in_x_tensor.set(np_array, place) + + # create Out Variable + out_tensor = scope.var('Out').get_tensor() + + # create and run lookup_table operator + extract_rows_op = Operator("extract_rows", X='X', Out='Out') + extract_rows_op.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + result_array = [ele[0] for ele in result_array] + assert result_array == rows + + def test_concat_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f8692ce2ea66ef61c63bc41e77df050398ac63fd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py @@ -0,0 +1,68 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
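
The flatten cases below all instantiate one shape rule: every dimension before `axis` collapses into the first output dimension and the rest into the second, so axis=0 yields a leading 1. A numpy sketch of that rule (the helper name ref_flatten is illustrative):

```python
import numpy as np


def ref_flatten(x, axis=1):
    # prod of dims before `axis` -> first output dim; the rest -> second.
    outer = int(np.prod(x.shape[:axis]))
    inner = int(np.prod(x.shape[axis:]))
    return x.reshape(outer, inner)


x = np.zeros((3, 2, 2, 5))
assert ref_flatten(x, axis=1).shape == (3, 20)   # the default-axis case below
assert ref_flatten(x, axis=0).shape == (1, 60)
assert ref_flatten(np.zeros((3, 2, 3, 2, 4, 4)), axis=4).shape == (36, 16)
```
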
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+class TestFlattenOp(OpTest):
+    def setUp(self):
+        self.op_type = "flatten"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 5)
+        self.axis = 1
+        self.new_shape = (3, 20)
+
+    def init_attrs(self):
+        self.attrs = {"axis": self.axis}
+
+
+class TestFlattenOpAxis0(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.axis = 0
+        self.new_shape = (1, 36)
+
+
+class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.new_shape = (3, 12)
+
+    def init_attrs(self):
+        self.attrs = {}
+
+
+class TestFlattenOpSixDims(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.axis = 4
+        self.new_shape = (36, 16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec0a939e9ec21952a6657ea849bb9844bb69cc8d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -0,0 +1,818 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
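
The 818-line file that follows encodes one convention: `functor_list` names the outer functor first, so ["scale", "elementwise_add"] computes scale(X + Y) while ["elementwise_add", "scale"] computes X + scale(Y), and likewise for relu. A minimal numpy reading of that convention, assuming equal shapes (the broadcast cases below reshape Y first; the helper name ref_fused is illustrative):

```python
import numpy as np


def ref_fused(x, y, functor_list, scale=1.0):
    # First-listed functor is the outer one: a binary outer functor is
    # applied as f1(X, f2(Y)); a unary outer functor wraps the sum.
    f1, f2 = functor_list
    if f1 == "elementwise_add":
        inner = y * scale if f2 == "scale" else np.maximum(y, 0)
        return x + inner
    total = x + y
    return total * scale if f1 == "scale" else np.maximum(total, 0)


x = np.full((2, 3), 2.0)
y = np.full((2, 3), -1.0)
assert np.allclose(ref_fused(x, y, ["scale", "elementwise_add"], 0.1), (x + y) * 0.1)
assert np.allclose(ref_fused(x, y, ["elementwise_add", "scale"], 0.1), x + y * 0.1)
assert np.allclose(ref_fused(x, y, ["elementwise_add", "relu"]), x + np.maximum(y, 0))
assert np.allclose(ref_fused(x, y, ["relu", "elementwise_add"]), np.maximum(x + y, 0))
```
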
+import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest + +# scale + add +# TestElementwiseAddOp +# TestFusedOperatorsOp_scalar +# TestFusedOperatorsOp_scalar2 +# TestFusedOperatorsOp_Vector +# TestFusedOperatorsOp_broadcast_0 +# TestFusedOperatorsOp_broadcast_1 +# TestFusedOperatorsOp_broadcast_2 +# TestFusedOperatorsOp_broadcast_3 +# TestFusedOperatorsOp_broadcast_4 +# TestFusedOperatorsOp_rowwise_add_0 +# TestFusedOperatorsOp_rowwise_add_1 +# TestFusedOperatorsOp_channelwise_add + + +class TestElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "fused_elemwise_activation" + self.dtype = np.float32 + self.axis = -1 + + self.init_axis() + self.init_dtype() + self.init_input() + self.init_output() + self.init_attr() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + + def init_input(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["scale", "elementwise_add"] + } + + def init_dtype(self): + pass + + def init_axis(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +class TestFusedOperatorsOp_scalar(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_scalar2(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_Vector(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.random((32, )).astype(self.dtype) + self.y = np.random.random((32, )).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +class TestFusedOperatorsOp_broadcast_0(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(2, 1, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_1(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_2(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + + def init_output(self): + self.scale = 0.1 + 
self.out = (self.x + self.y.reshape(1, 1, 4)) * self.scale + + +class TestFusedOperatorsOp_broadcast_3(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 4, 1)) * self.scale + + +class TestFusedOperatorsOp_broadcast_4(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2, 1).astype(self.dtype) + + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(2, 1, 1, 1)) * self.scale + + +class TestFusedOperatorsOp_rowwise_add_0(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 3, 4)) * self.scale + + +class TestFusedOperatorsOp_rowwise_add_1(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(2, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y.reshape(1, 1)) * self.scale + + +class TestFusedOperatorsOp_channelwise_add(TestElementwiseAddOp): + def init_input(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.scale = 0.1 + self.out = (self.x + self.y) * self.scale + + +# add + scale +# TestElementwiseAddOp_f_add_scale +# TestFusedOperatorsOp_scalar_f_add_scale +# TestFusedOperatorsOp_scalar2_f_add_scale +# TestFusedOperatorsOp_Vector_f_add_scale +# TestFusedOperatorsOp_broadcast_0_f_add_scale +# TestFusedOperatorsOp_broadcast_1_f_add_scale +# TestFusedOperatorsOp_broadcast_2_f_add_scale +# TestFusedOperatorsOp_broadcast_3_f_add_scale +# TestFusedOperatorsOp_broadcast_4_f_add_scale +# TestFusedOperatorsOp_rowwise_add_0_f_add_scale +# TestFusedOperatorsOp_rowwise_add_1_f_add_scale +# TestFusedOperatorsOp_channelwise_add_f_add_scale + + +class TestFusedOperatorsOp_f_add_scale(TestElementwiseAddOp): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_scalar_f_add_scale(TestFusedOperatorsOp_scalar): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_scalar2_f_add_scale(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_Vector_f_add_scale(TestFusedOperatorsOp_Vector): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", 
"scale"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_add_scale( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(2, 1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_add_scale( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_add_scale( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 1, 4) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_add_scale( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 4, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_add_scale( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y.reshape(2, 1, 1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_add_scale( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.1 + self.out = self.x + self.y.reshape(1, 3, 4) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_add_scale( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y.reshape(1, 1) * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_add_scale( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.scale = 0.2 + self.out = self.x + self.y * self.scale + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'scale': self.scale, + 'functor_list': ["elementwise_add", "scale"] + } + + +# add + relu +# TestElementwiseAddOp_f_add_relu +# TestFusedOperatorsOp_scalar_f_add_relu +# TestFusedOperatorsOp_scalar2_f_add_relu +# TestFusedOperatorsOp_Vector_f_add_relu +# TestFusedOperatorsOp_broadcast_0_f_add_relu +# TestFusedOperatorsOp_broadcast_1_f_add_relu +# TestFusedOperatorsOp_broadcast_2_f_add_relu +# TestFusedOperatorsOp_broadcast_3_f_add_relu +# TestFusedOperatorsOp_broadcast_4_f_add_relu +# TestFusedOperatorsOp_rowwise_add_0_f_add_relu +# TestFusedOperatorsOp_rowwise_add_1_f_add_relu +# TestFusedOperatorsOp_channelwise_add_f_add_relu + + +class 
TestFusedOperatorsOp_f_add_relu(TestElementwiseAddOp): + def init_output(self): + # Copy from test_activation_op.py + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_scalar_f_add_relu(TestFusedOperatorsOp_scalar): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_scalar2_f_add_relu(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_Vector_f_add_relu(TestFusedOperatorsOp_Vector): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_add_relu( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(2, 1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_add_relu( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_add_relu( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 1, 4), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_add_relu( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 4, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_add_relu( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(2, 1, 1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_add_relu( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 3, 4), 0) + + def init_attr(self): 
+ self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_add_relu( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y.reshape(1, 1), 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_add_relu( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.y[np.abs(self.y) < 0.005] = 0.02 + self.out = self.x + np.maximum(self.y, 0) + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["elementwise_add", "relu"] + } + + +# relu + add +# TestElementwiseAddOp_f_relu_add +# TestFusedOperatorsOp_scalar_f_relu_add +# TestFusedOperatorsOp_scalar2_f_relu_add +# TestFusedOperatorsOp_Vector_f_relu_add +# TestFusedOperatorsOp_broadcast_0_f_relu_add +# TestFusedOperatorsOp_broadcast_1_f_relu_add +# TestFusedOperatorsOp_broadcast_2_f_relu_add +# TestFusedOperatorsOp_broadcast_3_f_relu_add +# TestFusedOperatorsOp_broadcast_4_f_relu_add +# TestFusedOperatorsOp_rowwise_add_0_f_relu_add +# TestFusedOperatorsOp_rowwise_add_1_f_relu_add +# TestFusedOperatorsOp_channelwise_add_f_relu_add + + +class TestFusedOperatorsOp_f_relu_add(TestElementwiseAddOp): + def init_output(self): + # Copy from test_activation_op.py + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_scalar_f_relu_add(TestFusedOperatorsOp_scalar): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_scalar2_f_relu_add(TestFusedOperatorsOp_scalar2): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_Vector_f_relu_add(TestFusedOperatorsOp_Vector): + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_0_f_relu_add( + TestFusedOperatorsOp_broadcast_0): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.out = self.x + self.y.reshape(2, 1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_1_f_relu_add( + TestFusedOperatorsOp_broadcast_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 1) + 
self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_2_f_relu_add( + TestFusedOperatorsOp_broadcast_2): + def init_output(self): + self.out = self.x + self.y.reshape(1, 1, 4) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_3_f_relu_add( + TestFusedOperatorsOp_broadcast_3): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 4, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_broadcast_4_f_relu_add( + TestFusedOperatorsOp_broadcast_4): + def init_axis(self): + self.axis = 0 + + def init_output(self): + self.out = self.x + self.y.reshape(2, 1, 1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_rowwise_add_0_f_relu_add( + TestFusedOperatorsOp_rowwise_add_0): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 3, 4) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_rowwise_add_1_f_relu_add( + TestFusedOperatorsOp_rowwise_add_1): + def init_axis(self): + self.axis = 1 + + def init_output(self): + self.out = self.x + self.y.reshape(1, 1) + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +class TestFusedOperatorsOp_channelwise_add_f_relu_add( + TestFusedOperatorsOp_channelwise_add): + def init_axis(self): + self.axis = -1 + + def init_output(self): + self.out = self.x + self.y + self.out = np.maximum(self.out, 0) + self.out[np.abs(self.out) < 0.005] = 0.02 + + def init_attr(self): + self.attrs = { + 'axis': self.axis, + 'functor_list': ["relu", "elementwise_add"] + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 8fbf1560859aa295fc40b36129d0f0d07d55dd9f..86a2c674d01f45b2b141572c8191d2fba7fa312f 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -38,7 +38,7 @@ class TestGRUOp(OpTest): for i in range(len(seq_lens)): seq_starts.append(seq_starts[-1] + seq_lens[i]) sorted_seqs = sorted( - range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x]) + list(range(len(seq_lens))), lambda x, y: seq_lens[y] - seq_lens[x]) num_batch = seq_lens[sorted_seqs[0]] for batch_idx in range(num_batch): idx_in_seq = [] @@ -74,15 +74,16 @@ class TestGRUOp(OpTest): def gru(self): input, lod = self.inputs['Input'] w = self.inputs['Weight'] - b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros( + b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros( (1, 
self.frame_size * 3)) batch_gate = self.outputs['BatchGate'] batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev'] batch_hidden = self.outputs['BatchHidden'] hidden = self.outputs['Hidden'] idx_in_seq_list = self.idx_in_seq_list - h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key( - 'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size)) + h_p = self.inputs['H0'][ + self.sorted_seqs] if 'H0' in self.inputs else np.zeros( + (len(idx_in_seq_list[0]), self.frame_size)) num_batch = len(idx_in_seq_list) end_idx = 0 for batch_idx in range(num_batch): diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index c56b1eefd3a3dfe1478bd0526fa32077edcac9ba..87a9eba4d97459082cdf1499efeddf24ed51e1b1 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -76,7 +76,7 @@ class TestGRUUnitOp(OpTest): x = self.inputs['Input'] h_p = self.inputs['HiddenPrev'] w = self.inputs['Weight'] - b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros( + b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros( (1, frame_size * 3)) g = x + np.tile(b, (batch_size, 1)) w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape( diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index d090960c84e47da68a0ebea4609dfc3ed76e114e..daa5da8d95129af0305b326832a557daeb4c5c9c 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -17,6 +17,8 @@ import numpy as np import math from op_test import OpTest +np.random.seed(100) + def find_latest_set(num): return 1 + int(math.floor(math.log(num, 2))) diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py index 6ecfa9ea213fe0cf57e18fa83bbb85c223727d71..23b1ed957ad15bb631cd5160eb48328c76302987 100644 --- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py +++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py @@ -43,7 +43,7 @@ class TestLayer(unittest.TestCase): hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu') fluid.layers.batch_norm(input=hidden2) - print str(main_program) + print(str(main_program)) def test_dropout_layer(self): main_program = Program() @@ -53,7 +53,7 @@ class TestLayer(unittest.TestCase): name='pixel', shape=[3, 48, 48], dtype='float32') fluid.layers.dropout(x=images, dropout_prob=0.5) - print str(main_program) + print(str(main_program)) def test_img_conv_group(self): main_program = Program() @@ -65,7 +65,7 @@ class TestLayer(unittest.TestCase): conv1 = conv_block(images, 64, 2, [0.3, 0]) conv_block(conv1, 256, 3, [0.4, 0.4, 0]) - print str(main_program) + print(str(main_program)) def test_elementwise_add_with_act(self): main_program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 51460cbb1370f6794e13d18fe099865b4713691f..4cd203155f446df07d2fe6c1d56e0d20f1113679 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -48,7 +48,7 @@ class TestBook(unittest.TestCase): exe.run(init_program, feed={}, fetch_list=[]) - for i in xrange(100): + for i in range(100): tensor_x = 
np.array( [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32") tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 15a72cb605911dfe957fb927763174521a30a085..b215e379864e919af03591ab2566c08dddbb5743 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -27,12 +27,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -43,12 +44,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer(2.3)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer(2.3)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -61,12 +63,13 @@ class TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -80,18 +83,19 @@ class TestUniformInitializer(unittest.TestCase): program = framework.Program() program.random_seed = 123 block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(seed=456)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param1", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param2", + initializer=initializer.UniformInitializer(seed=456)) init_op = block.ops[1] self.assertEqual(init_op.attr("seed"), 123) init_op1 = block.ops[0] @@ -102,12 +106,13 @@ class TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -115,6 
+120,25 @@ class TestUniformInitializer(unittest.TestCase):
         self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 123)
 
+    def test_uniform_initializer_two_op(self):
+        """Test uniform initializer when the same parameter is created twice
+        """
+        program = framework.Program()
+        block = program.global_block()
+        for i in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer(-4.2, float(i), 123))
+        self.assertEqual(len(block.ops), 1)
+        init_op0 = block.ops[0]
+        self.assertEqual(init_op0.type, 'uniform_random')
+        self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA)
+        self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA)
+        self.assertEqual(init_op0.attr('seed'), 123)
+
 
 class TestNormalInitializer(unittest.TestCase):
     def test_normal_initializer_default_value(self):
@@ -122,12 +146,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -140,12 +165,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer(2.3, 1.9, 123))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer(2.3, 1.9, 123))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -161,12 +187,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -181,12 +208,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -203,12 +231,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+
lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -223,12 +252,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -244,13 +274,14 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer( - fan_in=12, fan_out=23, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer( + fan_in=12, fan_out=23, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -267,12 +298,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -287,12 +319,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -308,12 +341,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -328,12 +362,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + 
initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -348,13 +383,14 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer( - fan_in=12, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer( + fan_in=12, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -370,12 +406,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[8, 1, 3, 3], - lod_level=0, - name="param", - initializer=initializer.BilinearInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[8, 1, 3, 3], + lod_level=0, + name="param", + initializer=initializer.BilinearInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 69365db4d104a1b69916a605534eff83e242289f..295887ccd171a3101329eb1255da146914fa9264 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -17,6 +17,7 @@ import numpy as np from operator import mul import paddle.fluid.core as core import paddle.fluid as fluid +from functools import reduce np.random.random(123) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ab2ab24f354c1fbdc8b5221061db56a8d8a48689..8f2dac786d03334642714d221710a20833939c7a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -279,7 +279,7 @@ class TestBook(unittest.TestCase): def test_nce(self): window_size = 5 words = [] - for i in xrange(window_size): + for i in range(window_size): words.append( layers.data( name='word_{0}'.format(i), shape=[1], dtype='int64')) @@ -288,7 +288,7 @@ class TestBook(unittest.TestCase): label_word = int(window_size / 2) + 1 embs = [] - for i in xrange(window_size): + for i in range(window_size): if i == label_word: continue @@ -465,6 +465,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_shape(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.shape(input, name="shape") + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py index 16e85830ffa51ec428951570cc7a038f3d10c873..d53ead381d301e797d5a19784aed49a5d6f99319 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -36,7 +36,7 @@ class TestLoDRankTable(unittest.TestCase): exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() - self.assertEqual([(0, 5), 
(1, 1), (2, 1)], table.items()) + self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 118c22fbb1ff6be5859ae9e4aed6218b0c77deec..0ac6d9b81df0ecbe9c6560cdb0ab0507c3c2ed18 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -24,7 +24,7 @@ class TestLoDTensorArray(unittest.TestCase): tensor_array = arr.get_lod_tensor_array() self.assertEqual(0, len(tensor_array)) cpu = core.CPUPlace() - for i in xrange(10): + for i in range(10): t = core.LoDTensor() t.set(numpy.array([i], dtype='float32'), cpu) t.set_recursive_sequence_lengths([[1]]) @@ -32,7 +32,7 @@ class TestLoDTensorArray(unittest.TestCase): self.assertEqual(10, len(tensor_array)) - for i in xrange(10): + for i in range(10): t = tensor_array[i] self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32')) self.assertEqual([[1]], t.recursive_sequence_lengths()) diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index 5a4580116bc7009c73f1de14a265bf2cea5acf9b..9789ff4af648b41a1b53844be89249bd260de61b 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -35,8 +35,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) tensor.set_recursive_sequence_lengths([[3, 6, 1]]) - expect = map(lambda x: numpy.array(x).astype('int32'), - [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) + expect = [ + numpy.array(x).astype('int32') + for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]] + ] self.main( tensor=tensor, expect_array=expect, @@ -48,8 +50,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]]) - expect = map(lambda x: numpy.array(x).astype('int32'), - [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) + expect = [ + numpy.array(x).astype('int32') + for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]] + ] self.main( tensor=tensor, expect_array=expect, @@ -111,8 +115,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): expect = [ numpy.array( item, dtype='int32') - for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range( - 22, 39) + range(7, 21), range(39, 46)] + for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list( + range(22, 39)) + list(range(7, 21)), list(range(39, 46))] ] lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]], [[2], [6, 1]]] diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index f8d5785fbfe64843f4aa3b96b24809df60980c74..ac25f432dffd544d4b336983ec868f2431a5b91a 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -35,77 +35,59 @@ class TestLookupTableOp(OpTest): self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) +class TestLookupTableOpWithTensorIds(OpTest): + def setUp(self): + self.op_type = "lookup_table" + table = np.random.random((17, 31)).astype("float32") + ids = np.random.randint( + low=0, high=17, size=(2, 4, 5, 1)).astype("int64") + self.inputs = {'W': table, 
'Ids': ids}
+        self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+
+
 class TestLookupTableOpWithPadding(TestLookupTableOp):
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
-        self.attrs = {'padding_idx': long(padding_idx)}
+        self.attrs = {'padding_idx': int(padding_idx)}
         self.check_output()
 
     def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of
+        # Since paddings are not trainable and fixed in forward, the gradient of
         # paddings makes no sense and we don't test the gradient here.
         pass
 
 
-class TestLookupTableIdsIsSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Variable
-        height = 10
-        rows = [0, 4, 4, 7]
-        row_numel = 12
-
-        # create and initialize W Variable
-        W = scope.var('W').get_tensor()
-        W_array = np.full((height, row_numel), 1.0).astype("float32")
-        for i in range(height):
-            W_array[i] *= i
-        W.set(W_array, place)
-
-        # create and initialize Ids Variable
-        ids_selected_rows = scope.var('Ids').get_selected_rows()
-        ids_selected_rows.set_height(len(rows))
-        ids_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        ids_tensor = ids_selected_rows.get_tensor()
-        ids_tensor.set(np_array, place)
-
-        # create Out Variable
-        Out = scope.var('Out').get_selected_rows()
-
-        # create and run lookup_table operator
-        concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
-        concat_rows_op.run(scope, place)
-
-        # get result from Out
-        Out_tensor = Out.get_tensor()
-        result_array = np.array(Out_tensor)
-
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(rows):
-            assert (row == result_array[idx]).all()
+class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+    def test_check_output(self):
+        ids = self.inputs['Ids']
+        flatten_idx = ids.flatten()
+        padding_idx = np.random.choice(flatten_idx, 1)[0]
+        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
+        self.attrs = {'padding_idx': int(padding_idx)}
+        self.check_output()
 
-    def test_concat_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
+    def test_check_grad(self):
+        # Since paddings are not trainable and fixed in forward, the gradient of
+        # paddings makes no sense and we don't test the gradient here.
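+        # (The forward pass overwrites the rows selected by padding_idx with
+        # zeros, so there is no differentiable path back through them.)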
+ pass class TestLookupTableWIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Id Variable + def prepare_ids(self, scope, place): ids_tensor = scope.var('Ids').get_tensor() ids_array = np.array([[0], [4], [3], [5]]).astype("int64") ids_tensor.set(ids_array, place) + return ids_array - # create and initialize W Variable + def prepare_w(self, scope, place): rows = [0, 1, 2, 3, 4, 5, 6] row_numel = 12 @@ -118,8 +100,22 @@ class TestLookupTableWIsSelectedRows(OpTest): w_tensor = w_selected_rows.get_tensor() w_tensor.set(w_array, place) - # create Out Variable - out_tensor = scope.var('Out').get_tensor() + def create_out_tensor(self, scope, place): + return scope.var('Out').get_tensor() + + def check_result(self, ids_array, result_array): + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row[0] == result_array[idx]).all() + + def check_with_place(self, place): + scope = core.Scope() + + ids_array = self.prepare_ids(scope, place) + + self.prepare_w(scope, place) + + out_tensor = self.create_out_tensor(scope, place) # create and run lookup_table operator lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') @@ -127,9 +123,8 @@ class TestLookupTableWIsSelectedRows(OpTest): # get result from Out result_array = np.array(out_tensor) - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() + + self.check_result(ids_array, result_array) def test_w_is_selected_rows(self): places = [core.CPUPlace()] @@ -138,5 +133,19 @@ class TestLookupTableWIsSelectedRows(OpTest): self.check_with_place(place) +class TestLookupTableWithTensorIdsWIsSelectedRows( + TestLookupTableWIsSelectedRows): + def prepare_ids(self, scope, place): + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.random.randint( + low=0, high=6, size=(2, 4, 3, 1)).astype("int64") + ids_tensor.set(ids_array, place) + return ids_array + + def check_result(self, ids_array, result_array): + for idx, row in np.ndenumerate(ids_array): + assert (row == result_array[idx]).all() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py index 64d42b693bf11f3cb0153243909db4c0612bf4e7..32b4ee184787cd4cda0fd889f67a609141a3cb27 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_iou.py +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -80,7 +80,7 @@ class TestMeanIOUOp(OpTest): 'InCorrects': in_corrects, 'InMeanIou': in_mean_ious } - self.attrs = {'num_classes': long(self.num_classes)} + self.attrs = {'num_classes': int(self.num_classes)} mean_iou, out_wrong, out_correct = compute_mean_iou( predictions, labels, self.num_classes, in_wrongs, in_corrects, in_mean_ious) diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index cfd6e63e12258a92447e68b4afbc7ead91b68cc1..67733807f8f8582f68dcfa3f361e13a631a29597 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler2(unittest.TestCase): + def setUp(self): 
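+        # The back-to-back reshapes below are intended to give the memory
+        # transpiler in-place reuse opportunities; test_inplace_ops prints
+        # the program before and after optimization for inspection.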
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.program = program
+
+    def test_inplace_ops(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9daf83652e18faab0ab31402b9f5889a0beceaf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+
+
+def train_simulator(test_batch_size=10):
+    if test_batch_size <= 0:
+        raise ValueError("batch_size should be a positive integer value, "
+                         "but got batch_size={}".format(test_batch_size))
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    # Calculate memory usage in current network config
+    lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+        fluid.default_main_program(), batch_size=test_batch_size)
+
+    print("memory usage is about %.3f - %.3f %s" %
+          (lower_usage, upper_usage, unit))
+
+
+class TestMemoryUsage(unittest.TestCase):
+    def test_with_unit_B(self):
+        with self.program_scope_guard():
+            train_simulator()
+
+    def test_with_unit_KB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=1000)
+
+    def test_with_unit_MB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=100000)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index
aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff..10cb78a08db0471699bcc0b7323d5346e3af64c7 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -112,7 +112,7 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, if keep_top_k > -1 and num_det > keep_top_k: score_index = [] - for c, indices in selected_indices.iteritems(): + for c, indices in selected_indices.items(): for idx in indices: score_index.append((scores[c][idx], c, idx)) @@ -143,7 +143,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue - for c, indices in nmsed_outs.iteritems(): + for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index 76ecc8ba08ba31798040080a0ae99fe515c28cec..7431a142c53a64e58872390776904ce8f781d6a9 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -66,7 +66,7 @@ class TestNCE(OpTest): self.attrs = { 'num_total_classes': num_classes, 'num_neg_samples': num_neg_samples, - 'custom_neg_classes': range(num_neg_samples) + 'custom_neg_classes': list(range(num_neg_samples)) } self.inputs = { 'Input': input, diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py index d13f2b3afde10f9b4e632094fa216d8729069afa..06fccd39ac65ab62ee5618ac19d1a0535b481d06 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -28,13 +28,13 @@ class TestOneHotOp(OpTest): depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') - for i in xrange(np.product(x.shape)): + for i in range(np.product(x.shape)): out[i, x[i]] = 1.0 self.inputs = {'X': (x, x_lod)} @@ -51,13 +51,13 @@ class TestOneHotOp_default_dtype(OpTest): depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') - for i in xrange(np.product(x.shape)): + for i in range(np.product(x.shape)): out[i, x[i]] = 1.0 self.inputs = {'X': (x, x_lod)} @@ -76,7 +76,7 @@ class TestOneHotOp_exception(OpTest): self.dimension = 12 self.x = core.LoDTensor() x_lod = [[4, 1, 3, 3]] - data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) self.x.set(data, self.place) self.x.set_recursive_sequence_lengths(x_lod) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 63fb58c6927fa387b3b19147b9dc9d24bb8e5132..d17e493c36a2ffcba632f5f85c7a1d2e5066dd1c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -167,10 +167,10 @@ class TestCRFModel(unittest.TestCase): place=fluid.CPUPlace()) data = train_data() - for i in xrange(10): + for i in range(10): cur_batch = next(data) - print pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])[0] + print(pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0]) @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 60d63364d5f403f04519363db5ad3ad136f8a975..a43f2e7c49c02ce779344da44e640cabbf27986c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -71,7 +71,7 @@ class TestFetchOp(unittest.TestCase): fetch_list = [] all_vars = main.global_block().vars - for k, v in all_vars.iteritems(): + for k, v in all_vars.items(): if 'tmp' not in k and k[0] is not '_' or v.persistable: fetch_list.append(k) @@ -90,7 +90,7 @@ class TestFetchOp(unittest.TestCase): iters = 3 train_inputs = [] for i in range(iters): - train_inputs.append(tst_reader_iter.next()) + train_inputs.append(next(tst_reader_iter)) os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): @@ -133,7 +133,7 @@ class TestFeedParallel(unittest.TestCase): for batch_id, data in enumerate(reader()): loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] - print batch_id, loss_np + print(batch_id, loss_np) if batch_id == 2: break diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 76389d916fc39f470a22aed4792bf7b754600436..9448d89cd58f4e5cff4bac49821fbc44c5a46246 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -37,7 +37,7 @@ def simple_fc_net(use_feed): reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(4): + for _ in range(4): hidden = fluid.layers.fc( hidden, size=200, @@ -64,7 +64,7 @@ def fc_with_batchnorm(use_feed): img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(1): + for _ in range(1): hidden = fluid.layers.fc( hidden, size=200, @@ -98,16 +98,13 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def _init_data(self, random=True): + def _init_data(self): np.random.seed(5) - if random: - img = np.random.random(size=[32, 784]).astype(np.float32) - else: - img = np.ones(shape=[32, 784], dtype='float32') + img = np.random.random(size=[32, 784]).astype(np.float32) label = np.ones(shape=[32, 1], dtype='int64') return img, label - def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True): + def _compare_reduce_and_allreduce(self, model, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return self.check_network_convergence( @@ -115,7 +112,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_network_convergence( model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) - img, label = self._init_data(random_data) + img, label = self._init_data() all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, @@ -131,9 +128,9 @@ class 
TestMNIST(TestParallelExecutorBase):
                 use_reduce=True)
 
         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
 
     # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
@@ -166,27 +163,27 @@ class TestMNIST(TestParallelExecutorBase):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
-        img, label = self._init_data(random=False)
+        img, label = self._init_data()
 
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=True)
 
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEqual(
+            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+        self.assertAlmostEqual(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
         self.check_simple_fc_parallel_accuracy(True)
@@ -211,7 +208,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # FIXME(zcd): close this test temporarily.
+        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 834e920845f29b153909a971eb5afc4f8a33346e..a28428d8dee201ba105e18684c15d4b4582d989f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -21,6 +21,19 @@ from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
 import os
+import numpy as np
+
+# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
+# and Executor is different, because for ParallelExecutor the dropout_op of
+# the neural net will be copied N times (N is the number of devices). As a
+# result, the random numbers generated by ParallelExecutor and Executor
+# differ, so if we compare the loss of ParallelExecutor and Executor we
+# should remove the dropout_op.
+remove_dropout = False
+
+# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
+# and Executor is different.
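+# (Under ParallelExecutor each device computes its own batch statistics, so
+# single-device and multi-device runs can diverge; when set, the remove_bn
+# flag below swaps batch_norm out of conv_bn_layer.)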
+remove_bn = False def squeeze_excitation(input, num_channels, reduction_ratio): @@ -53,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, groups=groups, act=None, bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + return conv if remove_bn else fluid.layers.batch_norm( + input=conv, act=act, momentum=0.1) def shortcut(input, ch_out, stride): @@ -92,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt50Small(batch_size=2, use_feed=False): - assert not use_feed, "SE_ResNeXt doesn't support feed yet" +batch_size = 12 +img_shape = [3, 224, 224] + - img = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) +def SE_ResNeXt50Small(use_feed): + + img = fluid.layers.data(name='image', shape=img_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') conv = conv_bn_layer( input=img, num_filters=16, filter_size=3, stride=2, act='relu') @@ -127,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): reshape = fluid.layers.reshape( x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) pool = fluid.layers.reduce_mean(input=reshape, dim=2) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) + dropout = pool if remove_dropout else fluid.layers.dropout( + x=pool, dropout_prob=0.2, seed=1) # Classifier layer: prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) @@ -135,75 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): return loss -class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence_with_learning_rate_decay(self, - use_cuda=True, - use_reduce=False, - iter=20): +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """ + Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() - if use_cuda and not core.is_compiled_with_cuda(): - return + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr - os.environ['CPU_NUM'] = str(4) - def _cosine_decay(learning_rate, step_each_epoch, epochs=120): - """ - Applies cosine decay to the learning rate. 
-            lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
-            """
-            global_step = _decay_step_counter()
+def optimizer(learning_rate=0.01):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=cosine_decay(
+            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    return optimizer
 
-            with init_on_cpu():
-                epoch = ops.floor(global_step / step_each_epoch)
-                decayed_lr = learning_rate * \
-                    (ops.cos(epoch * (math.pi / epochs)) + 1)/2
-            return decayed_lr
-        def _optimizer(learning_rate=0.01):
-            optimizer = fluid.optimizer.Momentum(
-                learning_rate=_cosine_decay(
-                    learning_rate=learning_rate, step_each_epoch=2, epochs=1),
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
 
+class TestResnet(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        global remove_dropout
+        global remove_bn
+        remove_dropout = False
+        remove_bn = False
+
+    def _init_data(self, batch_size=2, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(
+                size=[batch_size] + img_shape).astype(np.float32)
+        else:
+            img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
+        label = [np.random.randint(0, 999) for _ in range(batch_size)]
+        label = np.array(label).astype(np.int64).reshape(-1, 1)
+        return img, label
+
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      iter=20,
+                                      delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
 
-        import functools
+        global remove_bn
+        remove_bn = True
 
-        batch_size = 2
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEqual(loss[0], loss[1], delta=delta2)
+
+    def _check_resnet_convergence(self,
+                                  model,
+                                  use_cuda=True,
+                                  use_reduce=False,
+                                  iter=20,
+                                  delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        global remove_dropout
+        global remove_bn
+        remove_dropout = True
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
         single_first_loss, single_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer,
+            optimizer=optimizer,
             use_parallel_executor=False)
-
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer)
+            optimizer=optimizer)
 
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEqual(
+            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
+        self.assertAlmostEqual(
+            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
 
     def test_seresnext_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, False)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, False, iter=5)
-
-    def test_seresnext_with_new_strategy_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, True)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, True, iter=5)
+        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
+        self._check_resnet_convergence(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+
+    def test_seresnext_with_new_strategy(self):
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 7688b8495d7f7c60e80f62dae2edd72be9f839d4..fcb5947ff05efd1c48ab9ec129ac9d17255d7020 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -25,7 +25,7 @@ def simple_fc_net():
     img = fluid.layers.data(name='image', shape=[784], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     hidden = img
-    for _ in xrange(4):
+    for _ in range(4):
         hidden = fluid.layers.fc(
             hidden,
             size=200,
@@ -71,7 +71,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                 share_vars_from=train_exe,
                 build_strategy=build_strategy)
 
-            for i in xrange(5):
+            for i in range(5):
                 test_loss, = test_exe.run([loss.name], feed=feed_dict)
                 train_loss, = train_exe.run([loss.name], feed=feed_dict)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b6215fddb11bb6b3a76b5a6395e7254d21971c13..8203d5d1fce0950130ab71db40fb306f73c41bd4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
 
 class ModelHyperParams(object):
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
             writer.append_tensor(t)
             writer.complete_append_tensor()
 
-    @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer, use_cuda=True)
-        self.check_network_convergence(transformer, use_cuda=False)
+        self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index 18309f457704f522457daefdb8464ae5df2ffcfb..c9617e36778740ce9620c3ad495c64c17277fde1 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -18,6 +18,7 @@ import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import
numpy +import six class BaseParallelForTest(unittest.TestCase): @@ -25,20 +26,20 @@ class BaseParallelForTest(unittest.TestCase): """ Run the unittest for parallel.for Args: - callback(callable): A callable function returns a generator. There - are two yields in the generator function. The first yield - returns the data layers, and the second yield returns the loss. - The modified data variables will be sent back during the first + callback(callable): A callable function returns a generator. There + are two yields in the generator function. The first yield + returns the data layers, and the second yield returns the loss. + The modified data variables will be sent back during the first yield. feed(dict): The executor feeding dictionary. - fetch(list|basestr): The fetch name lists. + fetch(list|basestr): The fetch name lists. Returns: None Raises: - AssertionError when the computation of cpu, parallel.for in cpu, + AssertionError when the computation of cpu, parallel.for in cpu, gpu, parallel.for in gpu are different. """ @@ -95,14 +96,14 @@ class BaseParallelForTest(unittest.TestCase): """ Run a single test, returns the fetch values Args: - place(Place): the computation place. - use_parallel(bool): Whether use parallel.for or not. + place(Place): the computation place. + use_parallel(bool): Whether use parallel.for or not. Returns: Fetched numpy arrays. """ - if isinstance(fetch, basestring): + if isinstance(fetch, six.string_types): fetch = [fetch] main = fluid.Program() startup = fluid.Program() @@ -124,7 +125,7 @@ class BaseParallelForTest(unittest.TestCase): data = [data] with pd.do(): - ins = map(pd.read_input, data) + ins = list(map(pd.read_input, data)) if len(ins) == 1: ins = ins[0] loss = generator.send(ins) # patch input @@ -156,7 +157,7 @@ class BaseParallelForTest(unittest.TestCase): Returns: None - + Raises: AssertionError diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py index 2105d320665367e3ec1bfd7b3a353a144c91244f..8aff4e87f67bc61a162f09e982cf0a7a61639257 100644 --- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py +++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py @@ -23,9 +23,9 @@ def PolygonBoxRestore(input): geo_channels = shape[1] h = shape[2] w = shape[3] - h_indexes = np.array(range(h) * w).reshape( + h_indexes = np.array(list(range(h)) * w).reshape( [w, h]).transpose()[np.newaxis, :] # [1, h, w] - w_indexes = np.array(range(w) * h).reshape( + w_indexes = np.array(list(range(w)) * h).reshape( [h, w])[np.newaxis, :] # [1, h, w] indexes = np.concatenate( (w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w] diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index f7e1e8573290766cde0c35816d687e7ba6fa4220..1cf70311b40bc7648b7462e93f201aa33c77b137 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -35,8 +35,8 @@ def max_pool2D_forward_naive(x, ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) @@ -63,8 +63,8 @@ def avg_pool2D_forward_naive(x, ) / strides[1] 
+ 1 if ceil_mode else (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 142165f29beeaedfaa660f04424147e06710d192..92c64b37921eafd4c90e247a235f2dacea8fea1e 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -38,13 +38,13 @@ def max_pool3D_forward_naive(x, ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] @@ -72,13 +72,13 @@ def avg_pool3D_forward_naive(x, ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index cf9b7639224ef3804b946f729bb6a9cead4aae23..e6a9f6f08cf1445c14494506641b0c3502591c37 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -29,21 +29,21 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) - for k in xrange(D_out): + for k in range(D_out): d_start = np.max((k * strides[0] - paddings[0], 0)) d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in xrange(H_out): + for i in range(H_out): h_start = np.max((i * strides[0] - paddings[0], 0)) h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - for j in xrange(W_out): + for j in range(W_out): w_start = np.max((j * strides[1] - paddings[1], 0)) w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) - for n in xrange(N): - for c in xrange(C): + for n in range(N): + for c in range(C): arr = x_masked[n, c, :, :, :] index = np.where(arr == np.max(arr)) 
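One caveat that the mechanical xrange-to-range substitutions in these naive pooling references do not cover: the output-size expressions such as (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1 relied on Python 2 integer division, which becomes float division under Python 3. A hedged sketch of the integer form these helpers would need (illustrative names, not code from this patch):

def pool_out_size(in_size, ksize, padding, stride, ceil_mode=False):
    # // preserves Python 2 integer-division semantics under Python 3
    numer = in_size - ksize + 2 * padding
    if ceil_mode:
        return (numer + stride - 1) // stride + 1
    return numer // stride + 1

assert pool_out_size(7, 3, 0, 2) == 3  # 7-wide input, 3-wide kernel, stride 2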
sub_deep = index[0][0] @@ -67,8 +67,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + for i in range(H_out): + for j in range(W_out): r_start = np.max((i * strides[0] - paddings[0], 0)) r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) c_start = np.max((j * strides[1] - paddings[1], 0)) @@ -77,8 +77,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) - for n in xrange(N): - for c in xrange(C): + for n in range(N): + for c in range(C): arr = x_masked[n, c, :, :] index = np.where(arr == np.max(arr)) sub_row = index[0][0] diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py index 091cfc9c72769fefc9c792bfeaa872cb357736b7..8c76393bdaccc0b701b409efebf08fac95aa5f1a 100644 --- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py +++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py @@ -32,7 +32,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None): # accumulate statistics pos, neg, neu = 0, 0, 0 - for _, ranks in predictions.items(): + for _, ranks in list(predictions.items()): for e1, e2 in itertools.combinations(ranks, 2): s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2] w = (w1 + w2) * 0.5 diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py index 7830ba29583d369c4b9f6f3077dda1dda1fd1c46..5ae425fee18b9b1baa0b945782268b79d6bb6625 100644 --- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py +++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py @@ -39,19 +39,19 @@ def get_states(idxs, labels, cls_num, weights=None): ins_num = idxs.shape[0] # TP FP TN FN states = np.zeros((cls_num, 4)).astype('float32') - for i in xrange(ins_num): + for i in range(ins_num): w = weights[i] if weights is not None else 1.0 idx = idxs[i][0] label = labels[i][0] if idx == label: states[idx][0] += w - for j in xrange(cls_num): + for j in range(cls_num): states[j][2] += w states[idx][2] -= w else: states[label][3] += w states[idx][1] += w - for j in xrange(cls_num): + for j in range(cls_num): states[j][2] += w states[label][2] -= w states[idx][2] -= w @@ -64,7 +64,7 @@ def compute_metrics(states, cls_num): total_fn_count = 0.0 macro_avg_precision = 0.0 macro_avg_recall = 0.0 - for i in xrange(cls_num): + for i in range(cls_num): total_tp_count += states[i][0] total_fp_count += states[i][1] total_fn_count += states[i][3] @@ -90,9 +90,9 @@ class TestPrecisionRecallOp_0(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = get_states(idxs, labels, cls_num) metrics = compute_metrics(states, cls_num) @@ -117,10 +117,10 @@ class TestPrecisionRecallOp_1(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = 
np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = get_states(idxs, labels, cls_num, weights) @@ -151,10 +151,10 @@ class TestPrecisionRecallOp_2(OpTest): ins_num = 64 cls_num = 10 max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = np.random.choice(xrange(cls_num), ins_num).reshape( + idxs = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = np.random.choice(xrange(cls_num), ins_num).reshape( + labels = np.random.choice(range(cls_num), ins_num).reshape( (ins_num, 1)).astype('int32') states = np.random.randint(0, 30, (cls_num, 4)).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index f75a79bfa42405747df9e6f4f4ab743014e303b9..621dd681345d2c185c0644de917b8bcd529f9937 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -183,7 +183,7 @@ class TestBlockDesc(unittest.TestCase): op2 = block.append_op() op0 = block._prepend_op() all_ops = [] - for idx in xrange(0, block.op_size()): + for idx in range(0, block.op_size()): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op0, op1, op2]) @@ -205,7 +205,7 @@ class TestBlockDesc(unittest.TestCase): program._sync_with_cpp() all_ops = [] - for idx in xrange(0, block.op_size()): + for idx in range(0, block.op_size()): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op0, op2]) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index d35183647ea57e378f0fe201ef03001122cb329f..3ad85d57485956e0cadb197dadd172516fa15c39 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -21,7 +21,7 @@ import unittest class TestReaderReset(unittest.TestCase): def prepare_data(self): def fake_data_generator(): - for n in xrange(self.total_ins_num): + for n in range(self.total_ins_num): yield np.ones(self.ins_shape) * n, n # Prepare data diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index d6ff18430e319e236f03d5661381e923cc956590..2e22df2beba9d74e28788fb72f6f7f7f2bef534e 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -203,12 +203,12 @@ class RecurrentOpTest1(unittest.TestCase): num_grad[idx], ana_grad[idx], rtol=0.1).all()) def check_forward(self): - print 'test recurrent op forward' + print('test recurrent op forward') pd_output = self.forward() py_output = self.py_rnn.forward() - print 'pd_output', pd_output + print('pd_output', pd_output) print - print 'py_output', py_output + print('py_output', py_output) self.assertEqual(pd_output.shape, py_output.shape) self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all()) @@ -445,7 +445,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape, self.output_shape) self.output = 
layers.mean(self.create_rnn_op(), **self.p_info) - print self.main_program + print(self.main_program) def create_rnn_op(self): x = layers.data( diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index f51b5a7e9907294a5b91c920a363830d8b9a7137..2f5558578ac2a002a83c2a7e027ec5a96d8b4414 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -25,7 +25,7 @@ class TestReshapeOp(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): @@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} def test_check_output(self): @@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index 9701d9adef1fd272f2520f66607acded6a8c25c6..1a6e1aad799e77b8e746353bee93680691939d24 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -26,9 +26,9 @@ class TestSeqProject(OpTest): if self.context_length == 1 \ and self.context_start == 0 \ and self.padding_trainable: - print "If context_start is 0 " \ + print("If context_start is 0 " \ "and context_length is 1," \ - " padding_trainable should be false." 
+ " padding_trainable should be false.") return # one level, batch size @@ -212,7 +212,7 @@ class TestSeqProjectCase2(TestSeqProject): self.context_stride = 1 self.input_size = [self.input_row, 23] - idx = range(self.input_size[0]) + idx = list(range(self.input_size[0])) del idx[0] offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + [self.input_size[0]]] diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 0bbd31814efdff6050733f6876ef64e3fcaaaf76..5ff0dab23e516ae8114b8264492fb2a9d5c0b3f8 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -44,7 +44,7 @@ class TestSequenceExpand(OpTest): out_lod = [[]] offset = 0 - for i in xrange(len(y_lod[ref_level])): + for i in range(len(y_lod[ref_level])): repeat_num = y_lod[ref_level][i] x_len = x_idx[i] @@ -55,7 +55,7 @@ class TestSequenceExpand(OpTest): stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) out = np.vstack((out, stacked_x_sub)) if x_lod is not None: - for j in xrange(repeat_num): + for j in range(repeat_num): out_lod[0].append(x_len) offset += x_len diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py index 68f2e5eba35ed318281d14e397dc6d363bcb4079..39b02ecf6ddb40737c4c1737d652c1a1b744d923 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py @@ -35,7 +35,7 @@ class TestSequenceReshape(OpTest): def compute_output(self, x, x_lod, dimension): x_width = x.shape[1] out_lod = [[]] - for i in xrange(len(x_lod[0])): + for i in range(len(x_lod[0])): seq_len = x_lod[0][i] offset = (seq_len * x_width) / dimension assert int(offset) * dimension == seq_len * x_width diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index 6f0e337034d1010880514181654170316fd9db19..a994bf181a74ca71a970da0105fe767f82750a6c 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -48,7 +48,7 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): def sum_lodtensor(self, tensor): sum_res = 0.0 - for i in xrange(np.product(tensor.shape())): + for i in range(np.product(tensor.shape())): sum_res += tensor._get_float_element(i) return sum_res diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 0ab581cfb0ea0ff2205450b8e62edb8bf3c51707..70ad05597c4a160cf6a25aeb3c379320cef69c63 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -26,15 +26,22 @@ def stable_softmax(x): class TestSoftmaxOp(OpTest): + def get_x_shape(self): + return [10, 10] + def setUp(self): self.op_type = "softmax" self.use_cudnn = False self.use_mkldnn = False self.dtype = np.float32 self.init_kernel_type() + self.shape = self.get_x_shape() + + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, 1, + x.reshape([-1, self.shape[-1]])) + out = out.reshape(self.shape) - x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype) - out = np.apply_along_axis(stable_softmax, 1, x) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = { @@ -63,6 
@unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxCUDNNOp(TestSoftmaxOp): @@ -70,6 +82,13 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp): self.use_cudnn = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): @@ -83,6 +102,13 @@ class TestSoftmaxFP16Op(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16Op2(TestSoftmaxFP16Op): + def get_x_shape(self): + return [2, 3, 4, 5] + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): @@ -97,10 +123,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + class TestSoftmaxMKLDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_mkldnn = True +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index e9f0a06a56b42952800411d548bb3fc1732e031e..ca7861309839d183e18c168403881a0b1b5bf309 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSplitIdsOp(OpTest): @@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest): self.check_output() +class TestSpliteIds(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + rows = [0, 5, 7, 4, 9] + height = 20 + row_numel = 2 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(rows) + x.set_height(height) + np_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + for j in range(row_numel): + np_array[i, j] = rows[i] + j + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + outs_name = ["out%d" % i for i in range(3)] + outs = [ + scope.var(var_name).get_selected_rows() for var_name in outs_name + ] + + # expected output selected rows + expected_out_rows = [[0, 9], [7, 4], [5]] + + op = Operator("split_ids", Ids="X", Out=outs_name) + + for _ in range(3): + op.run(scope, place) + + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), 
np.array(outs[i].get_tensor())[j, 1]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index eb49a53e54f4bdb6bcd6cb1991423970f29997bb..6b67a52e81b978ed78c72629f9177759f8e2c4e1 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -26,7 +26,7 @@ class TestSplitOp(OpTest): self.inputs = {'X': x} self.attrs = {'axis': axis, 'sections': [2, 1, 2]} self.outputs = {'Out': [('out%d' % i, out[i]) \ - for i in xrange(len(out))]} + for i in range(len(out))]} def _set_op_type(self): self.op_type = "split" diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 61040a39ced6dc57d05a10bf0605c80011db45c3..2b261820e04b08234477fc0a9adde95262f99bba 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -53,7 +53,7 @@ class TestSpliteSelectedRows(unittest.TestCase): height_sections = [5, 5, 5, 5, 3] # initialize output variables [out0, out1] - outs_name = ["out%d" % i for i in xrange(len(height_sections))] + outs_name = ["out%d" % i for i in range(len(height_sections))] outs = [ scope.var(var_name).get_selected_rows() for var_name in outs_name ] diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py index f0ab5909df62835b252154709e5ff75ca38235c8..3cbfc2a703f1c4a24674d468cd1152bfa6eb8ad2 100644 --- a/python/paddle/fluid/tests/unittests/test_spp_op.py +++ b/python/paddle/fluid/tests/unittests/test_spp_op.py @@ -26,7 +26,7 @@ class TestSppOp(OpTest): input = np.random.random(self.shape).astype("float32") nsize, csize, hsize, wsize = input.shape out_level_flatten = [] - for i in xrange(self.pyramid_height): + for i in range(self.pyramid_height): bins = np.power(2, i) kernel_size = [0, 0] padding = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index cc2fcc5ec0a076679c7dd85a7e8f8da6a170172b..cbc3da550306b9febe8a8fd22e7f71efa572a3d0 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -28,7 +28,7 @@ class TestTopkOp(OpTest): self.inputs = {'X': input} self.attrs = {'k': k} - for rowid in xrange(32): + for rowid in range(32): row = input[rowid] output[rowid] = np.sort(row)[-k:] indices[rowid] = row.argsort()[-k:] @@ -52,7 +52,7 @@ class TestTopkOp3d(OpTest): self.inputs = {'X': input_flat_2d} self.attrs = {'k': k} - for rowid in xrange(64): + for rowid in range(64): row = input_flat_2d[rowid] output[rowid] = np.sort(row)[-k:] indices[rowid] = row.argsort()[-k:] diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index a97d6dfdda9b79eed3be6302fb2b1c3810f189dc..ecce4cdde2d648fe7d65427e34c77f5f9ad61417 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -22,10 +22,10 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] out = np.zeros((s0, s1, out_hsize, out_wsize)) - for nidx in xrange(s0): - for cidx in xrange(s1): - for h in 
xrange(s2): - for w in xrange(s3): + for nidx in range(s0): + for cidx in range(s1): + for h in range(s2): + for w in range(s3): index = indices[nidx, cidx, h, w] hidx = (index - index % out_wsize) / out_wsize widx = index % out_wsize @@ -47,16 +47,16 @@ class TestUnpoolOp(OpTest): self.strides[1] + 1 input = np.zeros((nsize, csize, hsize_out, wsize_out)) indices = np.zeros((nsize, csize, hsize_out, wsize_out)) - for i in xrange(hsize_out): - for j in xrange(wsize_out): + for i in range(hsize_out): + for j in range(wsize_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) r_end = np.min((i * self.strides[0] + self.ksize[0] - \ self.paddings[0], hsize)) c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) c_end = np.min((j * self.strides[1] + self.ksize[1] - \ self.paddings[1], wsize)) - for nidx in xrange(nsize): - for cidx in xrange(csize): + for nidx in range(nsize): + for cidx in range(csize): x_masked = pre_input[nidx, cidx, r_start:r_end, \ c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index fe8808bc044684c96fb3382836be32dac1d241f3..790e6afe5f02236b00d9c67b7b25a881e07abace 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -66,7 +66,7 @@ class TestWhileOp(unittest.TestCase): exe = Executor(cpu) d = [] - for i in xrange(3): + for i in range(3): d.append(numpy.random.random(size=[10]).astype('float32')) outs = exe.run(feed={'d0': d[0], diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 55c6e54906e739ef0bc953fa5c9e9641ec575ccf..c6e176ca31c57e623addd9594be81c0abdce489b 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -18,14 +18,6 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def as_lodtensor(np_array, lod, place): - tensor = core.LoDTensor() - tensor.set(np_value, place) - if lod is not None: - tensor.set_recursive_sequence_lengths(lod) - return tensor - - def create_op(scope, op_type, inputs, outputs, attrs): kwargs = dict() @@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): def set_input(scope, op, inputs, place): + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def __set_input__(var_name, var): if isinstance(var, tuple) or isinstance(var, np.ndarray): tensor = scope.find_var(var_name).get_tensor() @@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place): tensor.set_recursive_sequence_lengths(var[1]) var = var[0] tensor._set_dims(var.shape) - tensor.set(var, place) + tensor.set(np_value_to_fluid_value(var), place) elif isinstance(var, float): scope.find_var(var_name).set_float(var) elif isinstance(var, int): @@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if name not in np_list: assert var_proto.intermediate, "{} not found".format(name) else: + # infer the dtype from the numpy value. np_value = np_list[name] if isinstance(np_value, tuple): dtype = np_value[0].dtype @@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if is_input: shape = list(np_value.shape) lod_level = 0 + # NOTE(dzhwinter): type hacking + # numpy float16 is bound to paddle::platform::float16 + # in tensor_py.h with the help of the uint16 datatype. Because + # the internal memory representation of float16 is + # actually uint16_t in paddle, we use np.uint16 in numpy for the + # raw memory so it can pass through pybind. In the test cases we + # feed data as data.view(np.uint16), but the dtype is in fact float16. + # data.view(np.uint16) does not cast the data; it reinterprets the same memory as uint16. + if dtype == np.uint16: + dtype = np.float16 return block.create_var( dtype=dtype, shape=shape, lod_level=lod_level, name=name)
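A minimal numpy-only sketch of the view trick described in the NOTE above; the reinterpretation is lossless in both directions because float16 and uint16 share the same 16-bit layout:

import numpy as np

x = np.random.uniform(0.1, 1, [4]).astype(np.float16)
raw = x.view(np.uint16)        # reinterpret the buffer, no cast
assert raw.dtype == np.uint16 and raw.nbytes == x.nbytes
back = raw.view(np.float16)    # viewing back restores the values
assert np.array_equal(back, x)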
@@ -142,7 +150,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): def append_loss_ops(block, output_names): - mean_inputs = map(block.var, output_names) + mean_inputs = list(map(block.var, output_names)) # for item in mean_inputs: # print(item) # print("Item", item.dtype) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index c62792face3c353db1f2e3c77eaf4bd32fbded69..17ab875f6a555c99bbe480a54ea2df3b1f60de2d 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -118,8 +118,9 @@ def multi_head_attention(queries, # FIXME(guosheng): Decouple the program desc with batch_size. return layers.reshape( x=trans_x, - shape=map(int, - [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])) + shape=list( + map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3] + ]))) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ @@ -403,7 +404,7 @@ def transformer( trg_pad_idx, pos_pad_idx, ): file_obj = fluid.layers.open_recordio_file( - filename='./wmt16.recordio', + filename='/tmp/wmt16.recordio', shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1], diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 6084e860ed9cb071f4afc8536f013843dc3f1a7b..1c4565a83c591f8efd56cbdbc843b2b19d233973 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -18,16 +18,15 @@ import errno import shutil import time -import core - -import data_feeder -import executor -import framework -import io +from . import core +from . import data_feeder +from . import executor +from . import framework +from . import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module -import optimizer as opt_module -import parallel_executor -from transpiler import distribute_transpiler +from . import optimizer as opt_module +from . import parallel_executor +from .transpiler import distribute_transpiler __all__ = [ 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', @@ -658,11 +657,12 @@ def build_feed_var_list(program, feed_order): if not isinstance(feed_order, dict): raise TypeError( "The 'feed_order' should be either None, list or dict.") - if not sorted(feed_order.values()) == range(len(feed_order)): + if not sorted(feed_order.values()) == list(range(len(feed_order))): raise ValueError( "The values of 'feed_order' should be a permutation of [0, len(feed_order))" ) - sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1]) + sorted_pair_list = sorted( + list(feed_order.items()), key=lambda item: item[1]) feed_var_list = [ program.global_block().var(pair[0]) for pair in sorted_pair_list ] @@ -1041,8 +1041,8 @@ def _save_pserver_vars_by_notify(executor, dirname, lookup_table, lookup_table(string): the lookup table name, when use distribute lookup table, we can get lookup table name by DistributeTranspiler. 
table_name - pserver_endpoints(list): the parameter server ip:port list. - when use distribute lookup table, we can get pserver_endpoints by + ps_endpoint_list(list): the parameter server ip:port list. + when using a distributed lookup table, we can get ps_endpoint_list from distribute arguments. Return: None @@ -1080,7 +1080,7 @@ def _save_trainer_args(dirname, trainer_id, trainer_args): cur_dir = _get_trainer_dir(dirname, trainer_id) - for name, value in trainer_args.iteritems(): + for name, value in list(trainer_args.items()): args_file = os.path.join(cur_dir, name) with open(args_file, 'w') as f: f.write(str(value)) @@ -1218,10 +1218,10 @@ def _scroll_delete(dirname, max_num_checkpoints=3): serial_num = _get_dir_serial(serial) serial_map[serial_num] = serial - if len(serial_map.keys()) <= max_num_checkpoints: + if len(list(serial_map.keys())) <= max_num_checkpoints: return - serials = serial_map.keys() + serials = list(serial_map.keys()) serials.sort(reverse=True) serials = serials[max_num_checkpoints:] for serial in serials: diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index eae13b50398f791d4a203b72a0e96f3e87cc2a88..a8622ad54433fff40f68520955f0294e2955577e 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig -from inference_transpiler import InferenceTranspiler -from memory_optimization_transpiler import memory_optimize, release_memory -from ps_dispatcher import HashName, RoundRobin +from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig +from .inference_transpiler import InferenceTranspiler +from .memory_optimization_transpiler import memory_optimize, release_memory +from .ps_dispatcher import HashName, RoundRobin __all__ = [ "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index dc597c33849dc06cc975b245099672f64c3539d3..1bfab1f219f8a2f08a0fb5c0042d87a3ad707dd5 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
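The import hunks in trainer.py and these package __init__.py files are all the same Python 3 fix: implicit relative imports are gone, so intra-package imports must be written explicitly. A small illustration with a hypothetical package (mypkg and helper are made-up names, not part of this patch):

# mypkg/__init__.py
from .utils import helper    # works on Python 2 and 3
# "from utils import helper" was the Python 2 implicit form;
# under Python 3 it raises ImportError inside a package.

# mypkg/utils.py
def helper():
    return "ok"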
-from program_utils import * -from ufind import * +from .program_utils import * +from .ufind import * diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 2ca1d4716b103d17117ae3ee958667c3a9747cdf..76d10777f5f9ed6d27d55a640108bd036d8d8bac 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -17,8 +17,8 @@ def delete_ops(block, ops): try: start = list(block.ops).index(ops[0]) end = list(block.ops).index(ops[-1]) - [block._remove_op(start) for _ in xrange(end - start + 1)] - except Exception, e: + [block._remove_op(start) for _ in range(end - start + 1)] + except Exception as e: raise e block.program._sync_with_cpp() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 2dd9ec3e473e538cce19e9c6698ac4a2f76bcba7..15675b4e9f485edabd97c79a0166eb67b4de6fee 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -28,18 +28,17 @@ Steps to transpile pserver: 5. add listen_and_serv op """ -from __future__ import print_function - import math import random import numpy as np -from ps_dispatcher import RoundRobin, HashName, PSDispatcher +from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ default_startup_program, Block, \ Parameter, grad_var_name -from details import * +from .details import * +from functools import reduce LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -102,7 +101,7 @@ def slice_variable(var_list, slice_count, min_block_size): block_size += dim1 - remains # update split_count after aligning split_count = int(math.ceil(var_numel / float(block_size))) - for block_id in xrange(split_count): + for block_id in range(split_count): curr_block_size = min(block_size, var_numel - ( (block_id) * block_size)) block = VarBlock(var.name, block_id, curr_block_size) @@ -117,7 +116,7 @@ class DistributeTranspilerConfig(object): try to choose the best method to balance loads for pservers. min_block_size (int): Minimum splitted element number in block. According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you + We can use bandwidth efficiently when data size is larger than 2MB. If you want to change it, please be sure you see the slice_variable function. """ @@ -218,7 +217,7 @@ class DistributeTranspiler(object): # fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1 # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 # shuffle the map will avoid the uneven distribution above - grad_var_mapping_items = self.grad_var_mapping.items() + grad_var_mapping_items = list(self.grad_var_mapping.items()) if not self.config.slice_var_up: random.seed(self.trainer_num) random.shuffle(grad_var_mapping_items) @@ -278,7 +277,7 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) # step4: Concat the parameters splits together after recv. 
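To make the arithmetic in slice_variable above concrete: each variable's element count is divided across slice_count servers, clamped below by min_block_size, and aligned up to the trailing dimension dim1 so every block holds whole rows. A stripped-down sketch (the clamp follows the documented intent of min_block_size; names are simplified, not the original signature):

import math

def split_sizes(var_numel, dim1, slice_count, min_block_size=8192):
    block_size = int(math.ceil(var_numel / float(slice_count)))
    if block_size < min_block_size:
        block_size = min_block_size
    remains = block_size % dim1
    if remains != 0:
        block_size += dim1 - remains   # align to whole rows
    split_count = int(math.ceil(var_numel / float(block_size)))
    return [min(block_size, var_numel - i * block_size)
            for i in range(split_count)]

print(split_sizes(100000, 10, 3))  # -> [33340, 33340, 33320]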
- for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in list(self.param_var_mapping.items()): eps = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) @@ -293,16 +292,17 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - program.global_block().append_op( - type="fetch_barrier", - inputs={}, - outputs={}, - attrs={ - "endpoints": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE - }) + if self.sync_mode: + program.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={}, + attrs={ + "endpoints": pserver_endpoints, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + }) - for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in list(self.param_var_mapping.items()): if len(splited_var) <= 1: continue orig_param = program.global_block().vars[varname] @@ -347,6 +347,7 @@ class DistributeTranspiler(object): # step1 pserver_program = Program() + pserver_program.random_seed = self.origin_program.random_seed # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -371,7 +372,7 @@ class DistributeTranspiler(object): dtype=v.dtype, shape=v.shape) if self.sync_mode and self.trainer_num > 1: - for trainer_id in xrange(self.trainer_num): + for trainer_id in range(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, @@ -461,7 +462,7 @@ class DistributeTranspiler(object): per_opt_block = pserver_program.create_block(pre_block_idx) optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay - # cases may like: + # cases may like: # L2Decay op -> clip op -> optimize for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping @@ -494,6 +495,7 @@ class DistributeTranspiler(object): pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) + optimize_blocks.append(table_opt_block) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( @@ -548,6 +550,7 @@ class DistributeTranspiler(object): """ s_prog = Program() orig_s_prog = default_startup_program() + s_prog.random_seed = orig_s_prog.random_seed params = self.param_grad_ep_mapping[endpoint]["params"] def _get_splited_name_and_shape(varname): @@ -560,7 +563,7 @@ class DistributeTranspiler(object): # 1. 
create vars in pserver program to startup program pserver_vars = pserver_program.global_block().vars created_var_map = dict() - for _, var in pserver_vars.iteritems(): + for _, var in list(pserver_vars.items()): tmpvar = s_prog.global_block()._clone_variable(var) created_var_map[var.name] = tmpvar @@ -810,7 +813,9 @@ class DistributeTranspiler(object): outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + # FIXME(qiao) temporarily disable this config because prefetch + # does not act like other rpc ops; it's more like a forward op + # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) # insert concat_op @@ -922,8 +927,6 @@ class DistributeTranspiler(object): self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) - # only support sgd now - assert table_opt_op.type == "sgd" if self.sync_mode: # create grad vars in pserver program @@ -963,11 +966,12 @@ class DistributeTranspiler(object): "LearningRate": [lr_var] } outputs = {"ParamOut": [param_var]} - table_opt_block.append_op( - type=table_opt_op.type, - inputs=inputs, - outputs=outputs, - attrs=table_opt_op.attrs) + # only sgd is supported now + import logging + logging.warn( + "distribute lookup table only supports the sgd optimizer, changing its optimizer to sgd instead of " + + table_opt_op.type) + table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs) # add table parameter gradient and its block id to grad_to_block_id grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx)) @@ -1018,11 +1022,11 @@ class DistributeTranspiler(object): var_mapping = dict() for block_str in block_list: varname, offset, size = block_str.split(":") - if not block_map.has_key(varname): + if varname not in block_map: block_map[varname] = [] - block_map[varname].append((long(offset), long(size))) + block_map[varname].append((int(offset), int(size))) - for varname, splited in block_map.iteritems(): + for varname, splited in list(block_map.items()): orig_var = program.global_block().var(varname) if len(splited) == 1: if self.sync_mode and add_trainer_suffix: @@ -1185,7 +1189,7 @@ class DistributeTranspiler(object): grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx)) if self.sync_mode and self.trainer_num > 1: vars2merge = [] - for i in xrange(self.trainer_num): + for i in range(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ (merged_var_name, i) vars2merge.append(pserver_block.vars[per_trainer_name]) @@ -1233,7 +1237,7 @@ class DistributeTranspiler(object): # learning rate variable has already been created by non-optimize op, # don't create it once again. 
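The dict and integer changes in the hunk above recur throughout this file and are worth stating once: Python 3 removed dict.has_key() in favor of the in operator, and removed long because int is now arbitrary-precision. A compact before/after sketch (illustrative values, not code from this patch):

block_map = {}
varname, offset, size = "fc_w", "0", "4096"

# Python 2 only:
#   if not block_map.has_key(varname): ...
#   block_map[varname].append((long(offset), long(size)))
# Python 2 and 3:
if varname not in block_map:
    block_map[varname] = []
block_map[varname].append((int(offset), int(size)))  # int covers the old long range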
lr_varname = opt_op.input(key)[0] - if pserver_block.vars.has_key(lr_varname): + if lr_varname in pserver_block.vars: new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]] else: origin_var = origin_program.global_block().vars[lr_varname] @@ -1273,7 +1277,9 @@ class DistributeTranspiler(object): def _is_splited_grad_var(self, var, var_dict): grad_block = None - for _, g in var_dict.iteritems(): + # TODO(minqiyang): replace these items() with six.iteritems() to + # improve memory + for _, g in list(var_dict.items()): if self._orig_varname(g.name) == self._orig_varname(var.name): if g.name.find(".trainer_") == -1: grad_block = g @@ -1283,7 +1289,7 @@ class DistributeTranspiler(object): def _clone_lr_op(self, program, block, op): inputs = self._get_input_map_from_op( self.origin_program.global_block().vars, op) - for key, varlist in inputs.iteritems(): + for key, varlist in list(inputs.items()): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: @@ -1292,7 +1298,7 @@ class DistributeTranspiler(object): outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, op) - for key, varlist in outputs.iteritems(): + for key, varlist in list(outputs.items()): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: @@ -1307,7 +1313,7 @@ class DistributeTranspiler(object): # Append the ops for parameters that do not need to be optimized/updated inputs = self._get_input_map_from_op( self.origin_program.global_block().vars, opt_op) - for key, varlist in inputs.iteritems(): + for key, varlist in list(inputs.items()): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: @@ -1317,7 +1323,7 @@ class DistributeTranspiler(object): var, program.global_block().vars) if grad_block: inputs[key] = grad_block - elif not program.global_block().vars.has_key(var.name): + elif var.name not in program.global_block().vars: program.global_block().create_var( name=var.name, persistable=var.persistable, @@ -1326,7 +1332,7 @@ class DistributeTranspiler(object): outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) - for key, varlist in outputs.iteritems(): + for key, varlist in list(outputs.items()): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: @@ -1334,7 +1340,7 @@ class DistributeTranspiler(object): var, program.global_block().vars) if grad_block: outputs[key] = grad_block - elif not program.global_block().vars.has_key(var.name): + elif var.name not in program.global_block().vars: program.global_block()._clone_variable(var) return optimize_block.append_op( @@ -1355,8 +1361,8 @@ class DistributeTranspiler(object): def _create_ufind(self, optimize_ops): # Create a unit find data struct by optimize ops ufind = UnionFind(optimize_ops) - for i in xrange(len(optimize_ops)): - for j in xrange(i, len(optimize_ops)): + for i in range(len(optimize_ops)): + for j in range(i, len(optimize_ops)): op1 = optimize_ops[i] op2 = optimize_ops[j] if self._is_op_connected(op1, op2): diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index f1905f08787da7a58a41d840ea68fb6c07f4028f..142fa5c31d2c558e482001da73fa26d8396c3967 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -305,6 +305,6 @@ class InferenceTranspiler(object): args += current_op.output_arg_names args = list(set(args)) # unique the input and output arguments - for var in 
self.block.vars.keys(): + for var in list(self.block.vars.keys()): if var not in args: self.block._remove_var(var) diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 0ca5cf813b51e200da5edd5830767ad9457acec2..20ba7ed2b0b9df0d0432727ee1f69f61533c402e 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -16,6 +16,8 @@ from collections import defaultdict from .. import core from ..framework import Program, default_main_program, Parameter from ..backward import _rename_arg_ +from functools import reduce +from six.moves import range dtype_to_size = { core.VarDesc.VarType.FP16: 2, @@ -107,7 +109,7 @@ class ControlFlowGraph(object): # Repeatedly apply liveness updates until the algorithm stablize # on a complete set live input vars and live output vars. while True: - for i in reversed(range(self.op_size)): + for i in reversed(list(range(self.op_size))): live_in[i] = set(self._live_in[i]) live_out[i] = set(self._live_out[i]) for s in self._successors[i]: @@ -172,9 +174,10 @@ class ControlFlowGraph(object): is_forward = i < self._forward_num in_diff, out_diff = self._get_diff(self._live_in[i], self._live_out[i]) - can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - in_diff) + can_optimize = [ + x for x in in_diff + if self._check_var_validity(block_desc, x, is_forward) + ] if can_optimize: index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 delete_op = block_desc._insert_op(index) @@ -213,9 +216,10 @@ class ControlFlowGraph(object): block_desc = op.block() is_forward = i < self._forward_num if self.pool: - defs_can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - self._defs[i]) + defs_can_optimize = [ + x for x in self._defs[i] + if self._check_var_validity(block_desc, x, is_forward) + ] out_pair = [ (x, self._find_var(block_desc, x, is_forward).shape()) for x in defs_can_optimize @@ -261,9 +265,10 @@ class ControlFlowGraph(object): break in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) - can_optimize = filter( - lambda x: self._check_var_validity(block_desc, x, is_forward), - in_diff) + can_optimize = [ + x for x in in_diff + if self._check_var_validity(block_desc, x, is_forward) + ] if can_optimize: for var_name in can_optimize: self.pool.append((var_name, self._find_var( diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 776619cd36722e338a9fdd5e13bceeaf3724de2c..b125eba4f83c588fa2fa81a357604a7d8592ea80 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -14,6 +14,7 @@ import collections import contextlib +import six import sys __all__ = ['generate', 'switch', 'guard'] @@ -67,8 +68,10 @@ def switch(new_generator=None): @contextlib.contextmanager def guard(new_generator=None): - if isinstance(new_generator, basestring): + if isinstance(new_generator, six.string_types): new_generator = UniqueNameGenerator(new_generator) + elif isinstance(new_generator, six.binary_type): + new_generator = UniqueNameGenerator(new_generator.decode()) old = switch(new_generator) yield switch(old) diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py index 4c905d959fad4e8c1a8826ce8dc60c5fa834514d..c861020225fb6fe0a29653363c2151b20dc8f578 100644 --- a/python/paddle/reader/creator.py +++ 
b/python/paddle/reader/creator.py @@ -67,11 +67,14 @@ def recordio(paths, buf_size=100): import recordio as rec import paddle.reader.decorator as dec - import cPickle as pickle + import six + import six.moves.cPickle as pickle def reader(): - if isinstance(paths, basestring): + if isinstance(paths, six.string_types): path = paths + elif isinstance(paths, six.binary_type): + path = paths.decode() else: path = ",".join(paths) f = rec.reader(path) diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 4b1fe94222d35f8c0e4e4cccc364227a3f9509d0..ce410e61b92e7d3f32fa5bfeb415e4b6c5fa9df6 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -21,6 +21,9 @@ from threading import Thread import subprocess from six.moves.queue import Queue +from six.moves import zip_longest +from six.moves import map +from six.moves import zip import itertools import random import zlib @@ -42,7 +45,7 @@ def map_readers(func, *readers): rs = [] for r in readers: rs.append(r()) - for e in itertools.imap(func, *rs): + for e in map(func, *rs): yield e return reader @@ -148,16 +151,16 @@ def compose(*readers, **kwargs): for r in readers: rs.append(r()) if not check_alignment: - for outputs in itertools.izip(*rs): - yield sum(map(make_tuple, outputs), ()) + for outputs in zip(*rs): + yield sum(list(map(make_tuple, outputs)), ()) else: - for outputs in itertools.izip_longest(*rs): + for outputs in zip_longest(*rs): for o in outputs: if o is None: # None will be not be present if compose is aligned raise ComposeNotAligned( "outputs of readers are not aligned.") - yield sum(map(make_tuple, outputs), ()) + yield sum(list(map(make_tuple, outputs)), ()) return reader @@ -306,7 +309,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): args = (in_queue, out_queue, mapper, out_order) if order else ( in_queue, out_queue, mapper) workers = [] - for i in xrange(process_num): + for i in range(process_num): worker = Thread(target=target, args=args) worker.daemon = True workers.append(worker) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index bee24d3b6579db5e99ec66931df201fdf9e1af07..537df489b9738864933b3a7922d178701db3d19f 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -136,7 +136,7 @@ class TestXmap(unittest.TestCase): reader = paddle.reader.xmap_readers(mapper, reader_creator_10(0), tNum, size, order) - for n in xrange(3): + for n in range(3): result = [] for i in reader(): result.append(i) @@ -156,7 +156,7 @@ class TestPipeReader(unittest.TestCase): import tempfile - records = [str(i) for i in xrange(5)] + records = [str(i) for i in range(5)] temp = tempfile.NamedTemporaryFile() try: with open(temp.name, 'w') as f: diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py index 6af250772859811b3c48434ab005e50b435dd320..374976db9f17ad9b1fd33c5d4adf77155336d100 100644 --- a/python/paddle/trainer/PyDataProviderWrapper.py +++ b/python/paddle/trainer/PyDataProviderWrapper.py @@ -42,7 +42,7 @@ except ImportError: try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle import io diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index ab9a2562dcccb394c0b24741ceeb10061e40cb9a..a2a32d848cbc4200397e6a12a3662419102da0a9 100644 --- 
a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -20,7 +20,7 @@ from .utils import deprecated try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle __all__ = ['define_py_data_sources2'] diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d9787ef42a31b8dfd1836e7a01d5664049cc66b5..ee34c157334b533b9c330b8103424964d7df510b 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -28,7 +28,7 @@ from .default_decorators import * try: import cPickle as pickle except ImportError: - import pickle + import six.moves.cPickle as pickle import copy __all__ = [ diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc..8312900dc43fdd64cc1a205ab846b6f1deaecf5d 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' -WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' -VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' -TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' -EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 5104e29051e4480f3a7eb18421f1b519841b009b..1ec210f265049c8b62cd99cd218f25a9f846ef43 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' 'wmt_shrinked_data/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' START = "" diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,7 +4,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ 
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,7 +4,7 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
+    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
         continue;
     else
         cpplint --filter=-readability/fn_size $file;
diff --git a/tools/diff_api.py b/tools/diff_api.py
index cf9f2c72cb78ddf88ff2a7bb1c0ee4b00ec0ec96..97c739ed2a5627ad9fd326f206976a4579dc26a3 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -20,9 +20,7 @@ for each_diff in result:
     if each_diff[0] in ['-', '?']:  # delete or change API is not allowed
         error = True
     elif each_diff[0] == '+':
-        # only new layers is allowed.
-        if not each_diff.startswith('+ paddle.fluid.layers.'):
-            error = True
+        error = True
 
     if each_diff[0] != ' ':
         print(each_diff)
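With the tools/diff_api.py change above, any added line in the API spec diff now raises an error, not just additions outside paddle.fluid.layers: the public API is effectively frozen against additions as well as deletions. A rough sketch of the surrounding logic, reconstructed under the assumption that the script feeds two generated API spec files through difflib (where '-', '+', and '?' prefix changed lines):

    import difflib
    import sys

    # Compare the committed API spec against a freshly generated one.
    with open(sys.argv[1]) as first, open(sys.argv[2]) as second:
        result = difflib.ndiff(first.readlines(), second.readlines())

    error = False
    for each_diff in result:
        if each_diff[0] in ['-', '?']:  # deleting or changing an API is not allowed
            error = True
        elif each_diff[0] == '+':       # after this patch, adding an API is not allowed either
            error = True
        if each_diff[0] != ' ':
            print(each_diff)

    if error:
        sys.exit(1)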
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index bca0b77ad71a3f65dda15191e5f540bfc2e043d1..0d59e4c110ff8502acb4dbcda15f855f7652a946 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
     bash build_scripts/install_nccl2.sh && rm -r build_scripts
@@ -40,11 +40,13 @@ RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddl
 
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt
 
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 
 RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 9dc750b89058cd73355a2f7984d577252c03526d..2d6a3cf8a97a3bbaa69b66f5343c54b750624329 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -12,19 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
 import unittest
 import os
 import sys
 import paddle.fluid as fluid
 import importlib
-import cStringIO
+from six.moves import cStringIO
 
 
 def main():
     sys.path.append(os.getcwd())
     some_test_failed = False
     for module_name in sys.argv[1:]:
-        buffer = cStringIO.StringIO()
+        buffer = cStringIO()
         main = fluid.Program()
         startup = fluid.Program()
         scope = fluid.core.Scope()
@@ -37,8 +38,11 @@ def main():
         res = unittest.TextTestRunner(stream=buffer).run(tests)
         if not res.wasSuccessful():
             some_test_failed = True
-            print >> sys.stderr, module_name, 'failed\n', buffer.getvalue(
-            )
+            print(
+                module_name,
+                'failed\n',
+                buffer.getvalue(),
+                file=sys.stderr)
 
     if some_test_failed:
         exit(1)
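The test_runner.py hunk removes the two remaining Python 3 blockers in that script: the Python 2 print statement becomes the print() function via the __future__ import, and cStringIO now comes from six.moves, where it is a factory callable rather than a module (hence buffer = cStringIO() instead of cStringIO.StringIO()). A minimal self-contained sketch of the same output-capture pattern; the test case is an illustrative stand-in, not from the patch:

    from __future__ import print_function
    import sys
    import unittest
    from six.moves import cStringIO  # io.StringIO on Py3, cStringIO.StringIO on Py2


    class SanityTest(unittest.TestCase):  # hypothetical stand-in for a real suite
        def test_truth(self):
            self.assertTrue(True)


    buffer = cStringIO()  # six.moves.cStringIO is callable, not a module
    suite = unittest.TestLoader().loadTestsFromTestCase(SanityTest)
    res = unittest.TextTestRunner(stream=buffer).run(suite)
    if not res.wasSuccessful():
        print('failed\n', buffer.getvalue(), file=sys.stderr)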