merge develop

a314a80c · sneaxiy · d8756913 · aa6b2bda · a314a80c · a314a80c
309 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -25,5 +25,7 @@ third_party/
 bazel-*
 third_party/
+build_*
 # clion workspace.
 cmake-build-*
+model_test
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,10 +69,11 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
-option(WITH_FAST_MATH   "Make use of fast math library"                 OFF)
+option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -127,6 +128,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
  "A path setting fluid shared and static libraries")
+set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
+  "A path setting fluid inference shared and static libraries")
 if (WITH_C_API AND WITH_PYTHON)
  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
    "when using C-API. It will give an unpredictable behavior when using a "
@@ -176,6 +180,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -298,3 +303,8 @@ if(WITH_DOC)
    find_python_module(recommonmark REQUIRED)
    add_subdirectory(doc)
 endif()
+if (ON_INFER)
+    message(WARNING "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+endif()
--- a/Dockerfile
+++ b/Dockerfile
@@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
    easy_install -U pip && \
-    pip install -U wheel && \
+    pip install -U pip setuptools wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip3 install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip3 install opencv-python && \
-    pip install pre-commit 'ipython==5.3.0' && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
+### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==0.15.0.post87
+pip install paddlepaddle-gpu==1.0.1.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==0.15.0.post85
+pip install paddlepaddle-gpu==1.0.1.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
   We appreciate your contributions!

--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
+INCLUDE(ExternalProject)
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+ENDIF()
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
+    TEST_COMMAND      ""
+)
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+include_directories(${XXHASH_INCLUDE_DIR})
+add_dependencies(xxhash extern_xxhash)
+LIST(APPEND external_project_dependencies xxhash)
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -261,6 +261,13 @@ function(cc_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} mklml)
        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
      endif()
+      # remove link to python, see notes at:
+      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
+      if("${cc_library_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_library_DEPS python)
+        add_dependencies(${TARGET_NAME} python)
+        target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
+      endif()
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@@ -311,6 +318,8 @@ function(cc_test TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction(cc_test)
@@ -629,6 +638,8 @@ function(py_test TARGET_NAME)
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction()

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -14,11 +14,14 @@
 # make package for paddle fluid shared and static library
 function(copy TARGET)
+    if (NOT ON_INFER)
+      message(WARNING "Turn on the ON_INFER flag when building inference_lib only.")
+    endif()
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -31,7 +34,7 @@ function(copy TARGET)
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
        list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
          COMMAND mkdir -p "${dst}"
          COMMAND cp -r "${src}" "${dst}"
          COMMENT "copying ${src} -> ${dst}")
@@ -67,6 +70,13 @@ copy(boost_lib
  DEPS boost
 )
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
 if(NOT PROTOBUF_FOUND)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
    copy(protobuf_lib
@@ -150,16 +160,16 @@ if (WITH_ANAKIN AND WITH_MKL)
        SRCS
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
        ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+       ${src_dir}/${module}/api/paddle_inference_api.h
       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 set(module "platform")
@@ -185,20 +195,41 @@ copy(cmake_cache
  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
  DSTS ${FLUID_INSTALL_DIR})
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
+# Following commands generate a inference-only fluid library
+# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
+copy(third_party DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
+)
+# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+copy(inference_api_lib DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)
+add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
 # paddle fluid version
-execute_process(
+function(version version_file)
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  execute_process(
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-file(WRITE ${version_file}
+  file(WRITE ${version_file}
-  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-  "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
-  "WITH_GPU: ${WITH_GPU}\n")
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-if(WITH_GPU)
+    "WITH_GPU: ${WITH_GPU}\n")
-  file(APPEND ${version_file}
+  if(WITH_GPU)
-    "CUDA version: ${CUDA_VERSION}\n"
+    file(APPEND ${version_file}
-    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+      "CUDA version: ${CUDA_VERSION}\n"
-endif()
+      "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+  endif()
+endfunction()
+version(${FLUID_INSTALL_DIR}/version.txt)
+version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
+paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
+paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@@ -84,7 +85,8 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
 paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@@ -95,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
@@ -105,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label',
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@@ -114,6 +116,7 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var
 paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
 paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
+paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
 paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
@@ -127,6 +130,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
 paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
 paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
@@ -170,6 +174,9 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -198,6 +205,9 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
 paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -12,6 +12,5 @@ endif(NOT WIN32)
 if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
+  add_subdirectory(train)
 endif()
-add_subdirectory(train)
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -95,9 +95,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    if (pass->Type() == "multi_devices_pass") {
-      pass->Erase("enable_sequence_execution");
+      pass->Erase("enable_sequential_execution");
-      if (enable_sequence_execution_) {
+      if (enable_sequential_execution_) {
-        pass->Set("enable_sequence_execution", new bool(true));
+        pass->Set("enable_sequential_execution", new bool(true));
      }
      pass->Erase("places");

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -69,7 +69,7 @@ struct BuildStrategy {
  bool enable_data_balance_{false};
-  bool enable_sequence_execution_{false};
+  bool enable_sequential_execution_{false};
  // User normally doesn't need to call this API.
  // The PassBuilder allows for more customized insert, remove of passes

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -239,9 +239,9 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
 // optimizer nodes after last backward nodes.
 // However, the assumption by SSAGraphBuilder should be relaxed in the future.
 std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(
-    const ir::Graph &graph, bool enable_sequence_execution = false) {
+    const ir::Graph &graph, bool enable_sequential_execution = false) {
  std::vector<ir::Node *> ret;
-  if (enable_sequence_execution) {
+  if (enable_sequential_execution) {
    VLOG(10) << "sequential execution mode is enabled";
    for (auto *node : graph.Nodes()) {
      if (node->IsOp()) {
@@ -269,9 +269,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(
  std::vector<ir::Node *> sorted_ret;
  for (size_t i = 0; i < ret.size(); ++i) {
    if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-          static_cast<int>(OpRole::kOptimize)) {
+                            static_cast<int>(OpRole::kOptimize))) {
        optimize_ops.push_back(ret[i]);
      } else {
        sorted_ret.push_back(ret[i]);
@@ -304,10 +304,10 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  Init();
  // Give the topology sort order and rebuild the graph structure.
-  bool enable_sequence_execution = Has("enable_sequence_execution") &&
+  bool enable_sequential_execution = Has("enable_sequential_execution") &&
-                                   Get<bool>("enable_sequence_execution");
+                                     Get<bool>("enable_sequential_execution");
  std::vector<ir::Node *> sorted_ops =
-      SortOpsAndDelayOptimizeOp(*graph, enable_sequence_execution);
+      SortOpsAndDelayOptimizeOp(*graph, enable_sequential_execution);
  auto nodes = graph->ReleaseNodes();
  ir::Graph &result = *graph;
@@ -465,7 +465,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  }
  // Insert dependencies between computation_ops
-  if (enable_sequence_execution) {
+  if (enable_sequential_execution) {
    InsertSequenceDependenciesBetweenComputationOps(graph.get());
  }

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -64,7 +64,8 @@ class OpHandleBase {
  virtual bool IsMultiDeviceTransfer() { return false; }
  const platform::DeviceContext *DeviceContext(platform::Place place) {
-    return dev_ctxes_[place];
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
  }
  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -49,6 +49,8 @@ struct VarHandleBase {
  void AddOutput(OpHandleBase* out, ir::Node* node) {
    if (pending_ops_.find(out) == pending_ops_.end()) {
+      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
+                     this->Node()->Name());
      pending_ops_.insert(out);
      node_->outputs.push_back(node);
    }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }
+template <typename RefCntMap>
+static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+                                GarbageCollector<Tensor>* gc,
+                                RefCntMap* ref_cnts) {
+  std::unordered_set<Tensor*> erase_tensors;
+  auto handler = [&](const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto it = ref_cnts->find(name);
+        if (it == ref_cnts->end()) continue;
+        if ((it->second)-- == 1) {
+          auto* var = scope.FindVar(name);
+          if (var != nullptr) {
+            VLOG(10) << "Erase tensor \'" << name << "\'";
+            if (var->IsType<LoDTensor>()) {
+              erase_tensors.insert(var->GetMutable<LoDTensor>());
+            } else if (var->IsType<SelectedRows>()) {
+              erase_tensors.insert(
+                  var->GetMutable<SelectedRows>()->mutable_value());
+            }
+          }
+        }
+      }
+    }
+  };
+  handler(op->Inputs());
+  handler(op->Outputs());
+  if (!erase_tensors.empty()) {
+    gc->Add(erase_tensors);
+  }
+}
 Executor::Executor(const platform::Place& place) : place_(place) {}
 void Executor::Close() {
@@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope>>();
+    var->GetMutable<std::vector<framework::Scope*>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  if (max_memory_size >= 0) {
+  // WhileOp would set keep_kids to false
+  // WhileGradOp would need the scopes created in WhileOp
+  // Perhaps, we should not perform eager deletion in WhileOp
+  // The scopes and variables created by WhileOp would be deleted
+  // in WhileGradOp.
+  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    op->Run(*local_scope, place_);
    if (gc != nullptr) {
-      std::vector<std::string> erase_vars;
+      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-      for (auto& input : op->Inputs()) {
+                          &(ctx->cur_ref_cnts_));
-        for (auto& input_name : input.second) {
-          auto it = ctx->cur_ref_cnts_.find(input_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {  // should delete it
-            erase_vars.emplace_back(input_name);
-            ctx->cur_ref_cnts_.erase(input_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      for (auto& output : op->Outputs()) {
-        for (auto& output_name : output.second) {
-          auto it = ctx->cur_ref_cnts_.find(output_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {
-            erase_vars.emplace_back(output_name);
-            ctx->cur_ref_cnts_.erase(output_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      if (!erase_vars.empty()) {
-        std::vector<framework::LoDTensor*> erase_tensors;
-        for (auto& name : erase_vars) {
-          auto* var = local_scope->FindVar(name);
-          if (var == nullptr) continue;
-          if (var->IsType<framework::LoDTensor>()) {
-            auto* tensor = var->GetMutable<framework::LoDTensor>();
-            erase_tensors.push_back(tensor);
-          }
-        }
-        if (!erase_tensors.empty()) gc->Add(erase_tensors);
-      }
    }
    if (FLAGS_benchmark) {

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -32,38 +32,32 @@ template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc& prog, size_t block_id) {
  auto& block = prog.Block(block_id);
-  std::unordered_set<std::string> ignored_vars;
  std::unordered_map<std::string, T> ref_cnts;
-  for (auto var_desc : block.AllVars()) {
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    auto type = var_desc->Proto()->type().type();
+    for (auto& name_pair : name_map) {
-    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
+      for (auto& name : name_pair.second) {
-      ignored_vars.insert(var_desc->Name());  // ignore persistable vars
+        auto* var_desc = block.FindVar(name);
-    }
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
-  }
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
-  for (auto op_desc : block.AllOps()) {
+            type != proto::VarType::SELECTED_ROWS) {
-    for (auto& input : op_desc->Inputs()) {
+          continue;
-      for (auto& input_name : input.second) {
-        if (!ignored_vars.count(input_name)) {
-          if (ref_cnts.count(input_name))
-            ++ref_cnts[input_name];
-          else
-            ref_cnts[input_name] = 1;
        }
-      }
-    }
-    for (auto& output : op_desc->Outputs()) {
+        auto it = ref_cnts.find(name);
-      for (auto output_name : output.second) {
+        if (it != ref_cnts.end()) {
-        if (!ignored_vars.count(output_name)) {
+          ++it->second;
-          if (ref_cnts.count(output_name))
+        } else {
-            ++ref_cnts[output_name];
+          ref_cnts[name] = 1;
-          else
-            ref_cnts[output_name] = 1;
        }
      }
    }
+  };
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
 }

--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
  // be created.
  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
  Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs =
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
-      *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
  if (index >= feed_inputs.size()) {
    feed_inputs.resize(index + 1);
  }

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -80,7 +80,6 @@ message OpProto {
    optional bool duplicable = 3 [ default = false ];
    optional bool intermediate = 4 [ default = false ];
    optional bool dispensable = 5 [ default = false ];
-    optional string reuse = 6;
  }
  // AttrProto describes the C++ type Attribute.

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -10,7 +10,7 @@ function(pass_library TARGET DEST)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
        message(STATUS "add pass ${TARGET} ${DEST}")
@@ -25,19 +25,25 @@ cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
+cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if (WITH_MKLDNN)
-    pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
+pass_library(conv_bn_fuse_pass inference)
+pass_library(seqconv_eltadd_relu_fuse_pass inference)
+if(WITH_MKLDNN)
+    pass_library(mkldnn_placement_pass base)
+    pass_library(conv_bias_mkldnn_fuse_pass inference)
+    pass_library(conv_relu_mkldnn_fuse_pass inference)
+    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+endif()
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -53,4 +59,5 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -262,7 +262,7 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
  std::unordered_set<std::string> specified_vars({"data_lod_attention",
                                                  "cell_init", "hidden_init",
                                                  "data", "week", "minute"});
-  int count = 0;
+  size_t count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsVar() && specified_vars.count(node->Name())) {
      ++count;

--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+#include <functional>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+template <typename BinaryOperation>
+LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
+                               BinaryOperation f) {
+  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
+  LoDTensor vec_y;
+  vec_y.Resize(vec_a.dims());
+  const float* a = vec_a.data<float>();
+  const float* b = vec_b.data<float>();
+  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < vec_a.numel(); i++) {
+    y[i] = f(a[i], b[i]);
+  }
+  return vec_y;
+}
+std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bias_pattern(conv_input);
+  int found_conv_bias_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBias fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_bias_pattern);                      // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern);  // CONV op
+    // bias
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
+    // output
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
+    // elementwise_add op
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
+    if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
+      VLOG(3) << "do not perform conv+bias fuse";
+      return;
+    }
+    auto* eltwise_bias_tensor =
+        scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
+    auto input_names = conv->Op()->InputNames();
+    bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end();
+    if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+      auto conv_bias_names = conv->Op()->Input("Bias");
+      // add eltwise bias to existing conv bias
+      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+      auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+      auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+      PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
+      *conv_bias_tensor = tensor_apply_eltwise(
+          *conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({eltwise_out->Name()}));
+      GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out});
+      IR_NODE_LINK_TO(conv, eltwise_out);
+    } else {
+      // take eltwise bias as conv bias
+      OpDesc desc;
+      desc.SetInput(
+          "Input", std::vector<std::string>({subgraph.at(conv_input)->Name()}));
+      desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
+      desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
+      desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
+      desc.SetType("conv2d");
+      for (auto& attr : conv->Op()->GetAttrMap()) {
+        desc.SetAttr(attr.first, attr.second);
+      }
+      auto conv_bias_node = g->CreateOpNode(&desc);
+      IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node);
+      IR_NODE_LINK_TO(conv_weight, conv_bias_node);
+      IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
+      IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
+      GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});
+    }
+    found_conv_bias_count++;
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bias_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvBiasFusePass);
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+* Fuse the Conv and Elementwise_add to a ConvBiasOp.
+*/
+class ConvBiasFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBiasFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_bias_mkldnn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
+#include <functional>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+#define GET_CONV_BN_NODES(pattern_name)                                      \
+  /* OPERATORS */                                                            \
+  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                       \
+  GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name);           \
+  /* CONV inputs */                                                          \
+  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);         \
+  /* CONV outputs */                                                         \
+  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);               \
+  /* BN inputs */                                                            \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name);               \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name);                 \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name);                 \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name);         \
+  /* BN outputs */                                                           \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name); /* Out */         \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name);         \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name);     \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
+void recompute_bias_and_weights(const Scope* scope,
+                                ir::Node* conv_weight,            //
+                                const ir::Node& bn_scale,         //
+                                const LoDTensor& bn_bias_tensor,  //
+                                const ir::Node& bn_mean,          //
+                                const ir::Node& bn_variance,      //
+                                LoDTensor* eltwise_y_in_tensor,   //
+                                float epsilon) {
+  using EigenVectorArrayMap =
+      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using ConstEigenVectorArrayMap =
+      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using EigenMatrixArrayMap = Eigen::Map<
+      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  // Re-compute bias of conv2d from BN
+  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
+  auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable<LoDTensor>();
+  auto* variance_tensor =
+      scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
+  auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
+  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
+                                       scale_tensor->numel(), 1);
+  EigenVectorArrayMap variance_array(
+      variance_tensor->mutable_data<float>(platform::CPUPlace()),
+      variance_tensor->numel(), 1);
+  ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
+                                      mean_tensor->numel(), 1);
+  ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
+                                         bn_bias_tensor.numel(), 1);
+  // variance will not be used anymore, so make it std_array and then tmp_array
+  variance_array += epsilon;
+  variance_array = variance_array.sqrt();
+  variance_array = scale_array / variance_array;
+  EigenVectorArrayMap eltwise_y_in_array(
+      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+      eltwise_y_in_tensor->numel(), 1);
+  eltwise_y_in_array =
+      ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
+  // Re-compute weight of conv2d from BN
+  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
+  auto weights_shape = weights->dims();
+  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+  EigenMatrixArrayMap weights_array_2d(
+      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
+      weights_shape_2d[1]);
+  weights_array_2d.colwise() *= variance_array;
+}
+std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bn_pattern(conv_input, false /*with_eltwise_add*/);
+  int found_conv_bn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBN fuse";
+    // conv, batch_norm,
+    // conv_weight, conv_out,
+    // bn_scale, bn_bias, bn_mean, bn_variance,
+    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,
+    // bn_saved_variance
+    GET_CONV_BN_NODES(conv_bn_pattern);
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+bn fuse";
+      return;
+    }
+    // Create eltwise_y (conv bias) variable
+    VarDesc eltwise_y_in_desc(
+        patterns::PDNodeName(name_scope_, "eltwise_y_in"));
+    eltwise_y_in_desc.SetPersistable(true);
+    auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
+    auto* eltwise_y_in_tensor =
+        scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
+    // Get batch norm bias
+    auto* bn_bias_tensor =
+        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
+    // Initialize eltwise_y
+    eltwise_y_in_tensor->Resize(bn_bias_tensor->dims());
+    std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+                eltwise_y_in_tensor->numel(), 0.0f);
+    // update weights and biases
+    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
+    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
+                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
+                               epsilon);
+    // with MKL-DNN fuse conv+bn into conv with bias
+    // without MKL-DNN fuse conv+bn into conv+elementwise_add
+    if (fuse_option == FUSE_MKLDNN) {
+      auto input_names = conv->Op()->InputNames();
+      bool has_bias = std::find(input_names.begin(), input_names.end(),
+                                "Bias") != input_names.end();
+      if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+        // reuse existing conv bias node
+        auto conv_bias_names = conv->Op()->Input("Bias");
+        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+        auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+        auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+        PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
+                          eltwise_y_in_tensor->dims());
+        auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
+        eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
+      } else {
+        // add new conv_bias node
+        conv->Op()->SetInput(
+            "Bias", std::vector<std::string>({eltwise_y_in_node->Name()}));
+        IR_NODE_LINK_TO(eltwise_y_in_node, conv);
+      }
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({bn_out->Name()}));
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
+           bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
+      IR_NODE_LINK_TO(conv, bn_out);
+      found_conv_bn_count++;
+    } else {  // fuse_option == FUSE_NATIVE
+      // create an elementwise add node.
+      OpDesc desc;
+      desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
+      desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
+      desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
+      desc.SetType("elementwise_add");
+      desc.SetAttr("axis", 1);
+      auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
+           bn_variance_out, bn_saved_mean, bn_saved_variance});
+      IR_NODE_LINK_TO(conv_out, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_op, bn_out);
+      found_conv_bn_count++;
+    }
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bn_count);
+  return graph;
+}
+std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bn_pattern(conv_input, true /*with_eltwise_add*/);
+  int found_conv_bn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBN fuse";
+    // conv, batch_norm,
+    // conv_weight, conv_out,
+    // bn_scale, bn_bias, bn_mean, bn_variance,
+    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance
+    GET_CONV_BN_NODES(conv_bn_pattern);
+    // OPERATORS
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bn_pattern);
+    // BIAS inputs
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_bn_pattern);
+    // BIAS outputs
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bn_pattern);
+    // Get eltwise_y (conv bias) variable
+    auto* eltwise_y_in_tensor =
+        scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
+    // Get batch norm bias
+    auto* bn_bias_tensor =
+        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
+    // update weights and biases
+    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
+    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
+                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
+                               epsilon);
+    // Update the elementwise_add node
+    eltwise->Op()->SetAttr("axis", 1);
+    eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
+    GraphSafeRemoveNodes(
+        graph.get(),
+        {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
+         bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
+    IR_NODE_LINK_TO(eltwise, bn_out);
+    found_conv_bn_count++;
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bn_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_bn_fuse_pass, paddle::framework::ir::ConvBNFusePass);
+REGISTER_PASS(conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::ConvEltwiseAddBNFusePass);
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp.
+ */
+class ConvBNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_bn_fuse"};
+};
+class ConvEltwiseAddBNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvEltwiseAddBNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include <functional>
+#include <utility>
+#include "paddle/fluid/framework/ir/graph_traits.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+// The function keeps the graph consistent by replacing
+// a node 'from' in the set of inputs nodes
+// of the visited node by a node 'to'.
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
+  for (auto& node : GraphTraits::DFS(*graph)) {
+    auto from_in_inputs =
+        std::find(std::begin(node.inputs), std::end(node.inputs), from);
+    if (from_in_inputs != std::end(node.inputs)) {
+      IR_NODE_LINK_TO(to, (&node));
+      auto inputs = node.Op()->Inputs();
+      using input_type = VariableNameMap::value_type;
+      std::for_each(std::begin(inputs), std::end(inputs),
+                    [from, to, &node](const input_type& i) -> void {
+                      auto param_names = i.second;
+                      auto pi = std::find(std::begin(param_names),
+                                          std::end(param_names), from->Name());
+                      if (pi != std::end(param_names)) {
+                        node.Op()->SetInput(i.first, {to->Name()});
+                      }
+                    });
+    }
+  }
+}
+}  // namespace
+using graph_ptr = std::unique_ptr<ir::Graph>;
+graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Conv conv_pattern{pattern, name_scope_};
+  auto conv_output = conv_pattern();
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
+  elementwise_add_pattern(conv_output);
+  conv_output->AsIntermediate();
+  auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> {
+    auto bias_input_names = conv_op.Op()->Inputs();
+    auto bias_it = bias_input_names.find("Bias");
+    if (bias_it != std::end(bias_input_names)) {
+      bool has_bias = !bias_it->second.empty();
+      if (has_bias) {
+        auto conv_bias_names = bias_it->second;
+        auto conv_bias_names_it =
+            std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
+                         [&conv_bias_names](Node* n) -> bool {
+                           return n->Name() == conv_bias_names[0];
+                         });
+        return std::make_pair(has_bias, *conv_bias_names_it);
+      }
+    }
+    return std::make_pair(false, nullptr);
+  };
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    OpDesc op_desc;
+    op_desc.SetType("conv2d");
+    op_desc.SetInput("Input", {conv_input->Name()});
+    op_desc.SetInput("Filter", {conv_filter->Name()});
+    op_desc.SetInput("ResidualData", {elementwise_add_x->Name()});
+    op_desc.SetOutput("Output", {conv_output->Name()});
+    bool has_bias;
+    Node* conv_bias;
+    std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op);
+    if (has_bias) {
+      op_desc.SetInput("Bias", {conv_bias->Name()});
+    }
+    for (const auto& attr : conv_op->Op()->GetAttrMap()) {
+      op_desc.SetAttr(attr.first, attr.second);
+    }
+    op_desc.SetAttr("fuse_residual_connection", true);
+    auto fused_conv_op = g->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(conv_input, fused_conv_op);
+    IR_NODE_LINK_TO(conv_filter, fused_conv_op);
+    IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
+    IR_NODE_LINK_TO(fused_conv_op, conv_output);
+    if (has_bias) {
+      IR_NODE_LINK_TO(conv_bias, fused_conv_op);
+    }
+    CorrectGraphEdges(g, elementwise_add_out, conv_output);
+    GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass);
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"residual_connections_fuse_pass"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gtest/gtest.h>
+#include <string>
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+constexpr int nodes_removed = 3;
+constexpr int nodes_added = 1;
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::pair<std::string, std::string>>& inputs,
+           const std::pair<std::string, std::string>& output) {
+  auto op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", true);
+  for (const auto& input : inputs) {
+    op->SetInput(input.first, {input.second});
+  }
+  op->SetOutput(output.first, {output.second});
+}
+struct IsReachable {
+  using func = std::function<bool(const std::string&, const std::string&)>;
+  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
+    auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
+                        const std::string& name) -> Node* {
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        if (name == node.Name()) {
+          return &node;
+        }
+      }
+      return nullptr;
+    };
+    return [&](std::string from, const std::string to) -> bool {
+      if (from == to) return true;
+      std::map<std::string, bool> visited;
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        visited[node.Name()] = false;
+      }
+      visited[from] = true;
+      std::list<std::string> queue;
+      queue.push_back(from);
+      while (!queue.empty()) {
+        auto cur = find_node(graph, queue.front());
+        queue.pop_front();
+        if (cur == nullptr) return false;
+        for (auto n : cur->outputs) {
+          if (n->Name() == to) return true;
+          if (!visited[n->Name()]) {
+            visited[n->Name()] = true;
+            queue.push_back(n->Name());
+          }
+        }
+      }
+      return false;
+    };
+  }
+};
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+  int conv_count = 0;
+  int elementwise_add_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      ++conv_count;
+    }
+    if (node->IsOp() && node->Op()->Type() == "elementwise_add") {
+      ++elementwise_add_count;
+    }
+  }
+  EXPECT_EQ(conv_count, 1);
+  EXPECT_EQ(elementwise_add_count, 0);
+}
+ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
+                             const std::vector<std::string>& persistent_vars) {
+  ProgramDesc prog;
+  auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* {
+    auto var = prog.MutableBlock(0)->Var(var_name);
+    var->SetType(proto::VarType::LOD_TENSOR);
+    return var;
+  };
+  for (const auto& v : transient_vars) {
+    add_var_to_prog(v);
+  }
+  for (const auto& v : persistent_vars) {
+    auto var = add_var_to_prog(v);
+    var->SetPersistable(true);
+  }
+  return prog;
+}
+}  // namespace
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
+  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "d"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_FALSE(is_reachable(graph)("a", "d"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
+  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(conv_elementwise_add_mkldnn_fuse_pass);
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -46,6 +46,12 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
+    FuseOptions fuse_option = FindFuseOption(*conv, *relu);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+relu fuse";
+      return;
+    }
    // Transform Conv node into ConvReLU node.
    OpDesc* desc = conv->Op();
    desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -20,17 +20,19 @@ namespace paddle {
 namespace framework {
 namespace ir {
-void SetOp(ProgramDesc* prog, const std::string& type,
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("use_mkldnn", use_mkldnn);
+    op->SetAttr("name", name);
    op->SetInput("Input", {inputs[0]});
    op->SetInput("Filter", {inputs[1]});
    op->SetInput("Bias", {inputs[2]});
  } else if (type == "relu") {
+    op->SetAttr("use_mkldnn", use_mkldnn);
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
@@ -43,7 +45,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
 ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;
  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::SELECTED_ROWS);
    if (v == "weights" || v == "bias") {
@@ -51,14 +54,24 @@ ProgramDesc BuildProgramDesc() {
    }
  }
-  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+  SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
        std::vector<std::string>({"b"}));
-  SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
+  SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
        std::vector<std::string>({"c"}));
-  SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
+  // conv+relu, both with MKL-DNN
-        std::vector<std::string>({"f"}));
+  SetOp(&prog, "conv2d", "conv1",
-  SetOp(&prog, "relu", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"g"}));
+        std::vector<std::string>({"f"}), true);
+  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), true);
+  SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}));
+  // conv+relu, only one with MKL-DNN
+  SetOp(&prog, "conv2d", "conv2",
+        std::vector<std::string>({"h", "weights2", "bias2"}),
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}));
  return prog;
 }
@@ -88,10 +101,16 @@ TEST(ConvReLUFusePass, basic) {
      auto* op = node->Op();
      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      ASSERT_TRUE(op->HasAttr("fuse_relu"));
+      // check if only "conv1" convolution is fused
-      bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (fuse_relu) {
+      if (op_name == "conv1") {
-        ++conv_relu_count;
+        ASSERT_TRUE(op->HasAttr("fuse_relu"));
+        bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+        if (fuse_relu) {
+          ++conv_relu_count;
+        }
+      } else if (op_name == "conv2") {
+        ASSERT_FALSE(op->HasAttr("fuse_relu"));
      }
    }
  }

--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+void FusePassBase::Init(const std::string& repr, Graph* graph) const {
+  repr_ = repr;
+  graph_ = graph;
+}
+Scope* FusePassBase::param_scope() const {
+  PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
+  return graph_->Get<framework::Scope*>(kParamScopeAttr);
+}
+void FusePassBase::AddStatis(int count_of_fused) const {
+  PADDLE_ENFORCE(graph_);
+  PADDLE_ENFORCE(!repr_.empty());
+  if (!graph_->Has(kFuseStatisAttr)) {
+    graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
+  }
+  auto& info =
+      graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
+  info[repr_] = count_of_fused;
+}
+FuseOptions FusePassBase::FindFuseOption(const Node& node1,
+                                         const Node& node2) const {
+#ifdef PADDLE_WITH_MKLDNN
+  bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node1.Op()->GetAttr("use_mkldnn"));
+  bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node2.Op()->GetAttr("use_mkldnn"));
+  if (node1_mkldnn && node2_mkldnn)
+    return FUSE_MKLDNN;
+  else if (!node1_mkldnn && !node2_mkldnn)
+    return FUSE_NATIVE;
+  else
+    return DO_NOT_FUSE;
+#else
+  return FUSE_NATIVE;
+#endif
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -25,32 +25,24 @@ namespace ir {
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+enum FuseOptions {
+  DO_NOT_FUSE,  // fusing will not be done
+  FUSE_NATIVE,  // fusing will be done without MKL-DNN
+  FUSE_MKLDNN   // fusing will be done with MKL-DNN
+};
 class FusePassBase : public Pass {
 public:
-  void Init(const std::string& repr, Graph* graph) const {
+  void Init(const std::string& repr, Graph* graph) const;
-    repr_ = repr;
+  Scope* param_scope() const;
-    graph_ = graph;
+  void AddStatis(int count_of_fused) const;
-  }
-  Scope* param_scope() const {
-    PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
-    return graph_->Get<framework::Scope*>(kParamScopeAttr);
-  }
-  void AddStatis(int count_of_fused) const {
-    PADDLE_ENFORCE(graph_);
-    PADDLE_ENFORCE(!repr_.empty());
-    if (!graph_->Has(kFuseStatisAttr)) {
-      graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
-    }
-    auto& info =
-        graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
-    info[repr_] = count_of_fused;
-  }
  virtual ~FusePassBase() {}
 protected:
+  virtual FuseOptions FindFuseOption(const Node& node1,
+                                     const Node& node2) const;
  mutable Graph* graph_;
  mutable std::string repr_;
 };

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
  std::deque<ir::Node *> q_nodes;
  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
  std::unordered_set<ir::Node *> g_nodes;
+  // q_set used to record records in the queue.
+  std::unordered_set<ir::Node *> q_set;
  size_t graph_count = 0;
-  auto traverse_nodes = [&visited_nodes,
+  auto traverse_nodes = [&visited_nodes, &q_nodes,
-                         &q_nodes](const std::vector<ir::Node *> &nodes) {
+                         &q_set](const std::vector<ir::Node *> &nodes) {
-    std::copy_if(
+    for (auto n : nodes) {
-        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
+      if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
-        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+        q_nodes.push_back(n);
+        q_set.insert(n);
+      }
+    }
  };
  while (visited_nodes.size() != nodes.size()) {
    if (!q_nodes.empty()) {
      auto cur_node = q_nodes.front();
      q_nodes.pop_front();
+      q_set.erase(cur_node);
      visited_nodes.insert(cur_node);
      g_nodes.insert(cur_node);
      traverse_nodes(cur_node->inputs);
@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
      for (auto &n : nodes) {
        if (visited_nodes.count(n) == 0) {
          q_nodes.push_back(n);
+          q_set.insert(n);
          break;
        }
      }

--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -200,15 +200,15 @@ TEST(GraphHelperTest, GraphNum) {
  Graph g(prog);
  BuildZeroGraph(&g);
-  ASSERT_EQ(GraphNum(g), 0);
+  ASSERT_EQ(GraphNum(g), 0UL);
  Graph g2(prog);
  BuildOneGraph(&g2);
-  ASSERT_EQ(GraphNum(g2), 1);
+  ASSERT_EQ(GraphNum(g2), 1UL);
  Graph g3(prog);
  BuildTwoGraphs(&g3);
-  ASSERT_EQ(GraphNum(g3), 2);
+  ASSERT_EQ(GraphNum(g3), 2UL);
 }
 }  // namespace ir

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -259,6 +259,8 @@ GraphPatternDetector::DetectPatterns() {
  return result;
 }
+// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
+// see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
    std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
  if (subgraphs->empty()) return;
@@ -626,6 +628,112 @@ bool VarLinksFromOp(Node *node, const std::string &op_type) {
  return false;
 }
+PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
+                                     bool with_eltwise_add) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  PDNode *eltwise_op = nullptr;
+  if (with_eltwise_add) {
+    eltwise_op =
+        pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+  }
+  auto *batch_norm_op =
+      pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm");
+  // Create variables
+  // Conv Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d");
+  PDNode *eltwise_y_in_var = nullptr;
+  PDNode *eltwise_out_var = nullptr;
+  if (with_eltwise_add) {
+    // Conv output as Bias input
+    conv_out_var->assert_is_op_input("elementwise_add", "X");
+    // Bias
+    eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
+                           ->assert_is_op_input("elementwise_add", "Y")
+                           ->AsInput();
+    eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+                          ->AsIntermediate()
+                          ->assert_is_only_output_of_op("elementwise_add");
+  } else {
+    // Conv output as BN input
+    conv_out_var->assert_is_op_input("batch_norm", "X");
+  }
+  // BN Scale
+  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
+                           ->AsInput()
+                           ->assert_is_persistable_var()
+                           ->assert_is_op_input("batch_norm", "Scale");
+  // BN Bias
+  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
+                          ->AsInput()
+                          ->assert_is_persistable_var()
+                          ->assert_is_op_input("batch_norm", "Bias");
+  // BN Mean
+  auto *bn_mean_var = pattern->NewNode(bn_mean_repr())
+                          ->AsInput()
+                          ->assert_is_persistable_var()
+                          ->assert_is_op_input("batch_norm", "Mean");
+  // BN Variance
+  auto *bn_variance_var = pattern->NewNode(bn_variance_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("batch_norm", "Variance");
+  // BN output
+  auto *bn_out_var = pattern->NewNode(bn_out_repr())
+                         ->AsOutput()
+                         ->assert_is_op_output("batch_norm");
+  auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
+                              ->AsOutput()
+                              ->assert_is_op_output("batch_norm", "MeanOut");
+  auto *bn_variance_out_var =
+      pattern->NewNode(bn_variance_out_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "VarianceOut");
+  auto *bn_saved_mean_var =
+      pattern->NewNode(bn_saved_mean_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "SavedMean");
+  auto *bn_saved_variance_var =
+      pattern->NewNode(bn_saved_variance_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "SavedVariance");
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  if (with_eltwise_add) {
+    eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
+        .LinksTo({eltwise_out_var});
+    batch_norm_op
+        ->LinksFrom({eltwise_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
+                     bn_variance_var})
+        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
+                  bn_saved_mean_var, bn_saved_variance_var});
+  } else {
+    batch_norm_op
+        ->LinksFrom({conv_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
+                     bn_variance_var})
+        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
+                  bn_saved_mean_var, bn_saved_variance_var});
+  }
+  return bn_out_var;
+}
 PDNode *patterns::ConvReLU::operator()(
    paddle::framework::ir::PDNode *conv_input) {
  // Create Operators
@@ -653,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()(
  return relu_out_var;
 }
+PDNode *patterns::SeqConvEltAddRelu::operator()(
+    paddle::framework::ir::PDNode *seqconv_input) {
+  // Create Operators
+  seqconv_input->assert_is_op_input("sequence_conv", "X");
+  auto *seqconv_op = pattern->NewNode(seqconv_repr())
+                         ->assert_is_op("sequence_conv")
+                         ->assert_op_attr<bool>("paddingTrainable", false)
+                         ->assert_op_attr<int>("contextStride", 1);
+  auto *eltadd_op =
+      pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
+  auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  // Create variables
+  // Filter
+  auto *seqconv_weight_var =
+      pattern->NewNode(seqconv_weight_repr())
+          ->AsInput()
+          ->assert_is_persistable_var()
+          ->assert_is_op_input("sequence_conv", "Filter");
+  // Bias
+  auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr())
+                              ->AsIntermediate()
+                              ->assert_is_only_output_of_op("sequence_conv")
+                              ->assert_is_op_input("elementwise_add");
+  auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr())
+                             ->AsIntermediate()
+                             ->assert_is_only_output_of_op("elementwise_add")
+                             ->assert_is_only_input_of_op("relu");
+  // output
+  auto *relu_out_var = pattern->NewNode(relu_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("relu");
+  seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var})
+      .LinksTo({seqconv_out_var});
+  eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var})
+      .LinksTo({eltadd_out_var});
+  relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var});
+  return relu_out_var;
+}
 PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
                                 bool with_bias) {
  // Create shared nodes.
@@ -858,6 +1011,79 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
  return ele_add_grad;
 }
+PDNode *patterns::ConvBias::operator()(
+    paddle::framework::ir::PDNode *conv_input) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto *eltiwse_op =
+      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+  // Create variables
+  // Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("elementwise_add");
+  // Bias stored in elementwise_add
+  auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
+                               ->AsInput()
+                               ->assert_is_persistable_var()
+                               ->assert_is_op_input("elementwise_add", "Y");
+  // output
+  auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+                              ->AsOutput()
+                              ->assert_is_op_output("elementwise_add");
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var})
+      .LinksTo({eltwise_out_var});
+  return eltwise_out_var;
+}
+PDNode *patterns::Conv::operator()() {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+  conv_op->LinksFrom({input_var, filter_var});
+  conv_op->LinksTo({output_var});
+  return output_var;
+}
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  x_var->assert_is_op_input("elementwise_add", "X");
+  auto y_var = pattern->NewNode(elementwise_add_x_repr())
+                   ->AsInput()
+                   ->assert_is_op_input("elementwise_add", "Y");
+  auto out_var = pattern->NewNode(elementwise_add_out_repr())
+                     ->AsOutput()
+                     ->assert_is_op_output("elementwise_add", "Out");
+  elementwise_add_op->LinksFrom({x_var, y_var});
+  elementwise_add_op->LinksTo({out_var});
+  return out_var;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -128,6 +128,15 @@ struct PDNode {
      const std::unordered_set<std::string>& op_types,
      const std::string& argument, int nth);
+  template <typename T>
+  PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
+    asserts_.emplace_back([=](Node* x) {
+      return x && x->IsOp() && x->Op()->HasAttr(attr_name) &&
+             boost::get<T>(x->Op()->GetAttr(attr_name)) == attr;
+    });
+    return this;
+  }
 private:
  PDNode(PDPattern* pattern, const std::string& name = "",
         Type type = Type::kVar)
@@ -375,6 +384,44 @@ struct PatternBase {
  size_t id_;
 };
+// Conv with batch norm
+// op: conv + (elementwise_add +) batch_norm
+// named nodes:
+// conv_weight, conv_out, conv,
+// bn_x, bn_scale, bn_bias, bn_mean,  bn_variance,
+// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out,
+// bn_saved_mean, bn_saved_variance
+struct ConvBN : public PatternBase {
+  ConvBN(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bn") {}
+  PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(batch_norm);
+  PATTERN_DECL_NODE(eltwise);  // ELEMENTWISE_ADD
+  // CONV inputs
+  PATTERN_DECL_NODE(conv_weight);  // Filter
+  // CONV outputs
+  PATTERN_DECL_NODE(conv_out);  // tmp
+  // ELTWISE inputs
+  PATTERN_DECL_NODE(eltwise_y_in);
+  // ELTWISE outputs
+  PATTERN_DECL_NODE(eltwise_out);  // tmp
+  // BN inputs
+  PATTERN_DECL_NODE(bn_scale);
+  PATTERN_DECL_NODE(bn_bias);
+  PATTERN_DECL_NODE(bn_mean);
+  PATTERN_DECL_NODE(bn_variance);
+  // BN outputs
+  PATTERN_DECL_NODE(bn_out);  // Out
+  PATTERN_DECL_NODE(bn_mean_out);
+  PATTERN_DECL_NODE(bn_variance_out);
+  PATTERN_DECL_NODE(bn_saved_mean);
+  PATTERN_DECL_NODE(bn_saved_variance);
+};
 // CONV with ReLU
 // op: conv + relu
 // named nodes:
@@ -396,6 +443,31 @@ struct ConvReLU : public PatternBase {
  PATTERN_DECL_NODE(relu_out);
 };
+// SEQCONV with Elementwise_Add ReLU
+// op: seqconv + elementwise_add + relu
+// named nodes:
+// seqconv_input, seqconv_weight,
+// seqconv_out, seqconv,
+// elementwise_add_bias, elementwise_add_out, elementwise_add
+// relu_out, relu
+struct SeqConvEltAddRelu : public PatternBase {
+  SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}
+  PDNode* operator()(PDNode* seqconv_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(seqconv);
+  PATTERN_DECL_NODE(eltadd);
+  PATTERN_DECL_NODE(relu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(seqconv_weight);
+  PATTERN_DECL_NODE(seqconv_out);
+  PATTERN_DECL_NODE(eltadd_bias);
+  PATTERN_DECL_NODE(eltadd_out);
+  PATTERN_DECL_NODE(relu_out);
+};
 // FC with bias
 // op: mul + elementwise_add
 // named nodes:
@@ -540,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
  PATTERN_DECL_NODE(d_ele_y);
  PATTERN_DECL_NODE(ele_y);
 };
+// Conv with Elementwise_add as bias
+// op: conv + elementwise_add
+// named nodes:
+// conv_input, conv_weight,
+// conv_out, conv,
+// eltwise_bias, eltwise_out,
+// elementwise_add
+struct ConvBias : public PatternBase {
+  ConvBias(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bias") {}
+  PDNode* operator()(PDNode* conv_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(eltwise);
+  // declare variable node's name
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(eltwise_bias);
+  PATTERN_DECL_NODE(eltwise_out);
+};
+// Convolution op
+// Forward pass for convolution.
+// conv_input, conv_bias and conv_filter are inputs.
+// conv_output is a result of the operator.
+// residual_data is data used by skip connection.
+// If residual connection fusion is on, the formula is:
+// conv_output = conv_op(conv_filter, conv_input, conv_bias)
+//             + conv_residual_data
+// If the fusion is off, conv_residual_data is not added.
+struct Conv : public PatternBase {
+  Conv(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "convolution") {}
+  PDNode* operator()();
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+// ElementwiseAdd used in residual connections.
+// y_var is used and convolution output.
+// The operator is removed, when residual
+// connection fusion is on.
+struct ElementwiseAdd : public PatternBase {
+  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elementwise_add") {}
+  PDNode* operator()(PDNode* x_var);
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_x);
+  PATTERN_DECL_NODE(elementwise_add_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
 }  // namespace patterns
 // Link two ir::Nodes from each other.

--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -124,7 +124,7 @@ TEST(GraphTest, Basic) {
      ASSERT_EQ(n->outputs.size(), 0UL);
    }
  }
-  ASSERT_EQ(nodes.size(), 5);
+  ASSERT_EQ(nodes.size(), 5UL);
 }
 TEST(GraphTest, WriteAfterRead) {

--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+      n->Op()->SetAttr("use_mkldnn", true);
+    }
+  }
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(mkldnn_placement_pass,
+              paddle::framework::ir::MKLDNNPlacementPass);
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class MKLDNNPlacementPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
+                  ->assert_is_op_input("sequence_conv")
+                  ->assert_var_not_persistable();
+  patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
+  fuse_pattern(x);
+  // Create New OpDesc
+  auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
+                          Node* eltadd_bias, Node* relu_out) {
+    OpDesc op_desc;
+    op_desc.SetType("fusion_seqconv_eltadd_relu");
+    op_desc.SetInput("X", {input->Name()});
+    op_desc.SetInput("Filter", {seqconv_weight->Name()});
+    op_desc.SetInput("Bias", {eltadd_bias->Name()});
+    op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength"));
+    op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
+    op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride"));
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    const std::string ColMat = patterns::UniqueKey("SeqConvColMat");
+    op_desc.SetOutput("ColMat", {ColMat});
+    op_desc.SetOutput("Out", {relu_out->Name()});
+    scope->Var(ColMat)->GetMutable<LoDTensor>();
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(seqconv_weight, op);
+    IR_NODE_LINK_TO(eltadd_bias, op);
+    IR_NODE_LINK_TO(op, relu_out);
+    return op;
+  };
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);
+    fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias,
+                 relu_out);
+    std::unordered_set<const Node*> marked_nodes(
+        {seqconv, seqconv_out, eltadd, eltadd_out, relu});
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+  return fusion_count;
+}
+std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope());
+  AddStatis(fusion_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
+              paddle::framework::ir::SeqConvEltAddReluFusePass);
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class SeqConvEltAddReluFusePass : public FusePassBase {
+ public:
+  virtual ~SeqConvEltAddReluFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -18,6 +18,82 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
+// directly, because there are many vector<LoDTensor> used accross the project,
+// and some of them are treated as LoDTensorArray.
+#if !defined(PADDLE_ON_INFERENCE)
 using LoDTensorArray = std::vector<LoDTensor>;
-}
+#else  // !PADDLE_ON_INFERENCE
+#pragma message "LoDTensorArray is replaced with the inference one."
+/*
+ * A LoDTensorArray which will not deallocate buffer when resized, fix the data
+ * diff in inference, and more performance friendly in the concurrency
+ * scenerios.
+ */
+class LoDTensorArray {
+ public:
+  LoDTensorArray() = default;
+  using iterator = std::vector<LoDTensor>::iterator;
+  using const_iterator = std::vector<LoDTensor>::const_iterator;
+  const_iterator begin() const { return array_.begin(); }
+  const_iterator end() const { return array_.begin() + size_; }
+  iterator begin() { return array_.begin(); }
+  iterator end() { return array_.begin() + size_; }
+  void push_back(const LoDTensor& x) {
+    if (size_ < array_.size()) {
+      array_[size_++] = x;
+    } else {
+      array_.push_back(x);
+      ++size_;
+    }
+  }
+  void resize(size_t size) {
+    if (array_.size() < size) {
+      array_.resize(size);
+    }
+    size_ = size;
+  }
+  void emplace_back() { array_.emplace_back(); }
+  void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
+  LoDTensor& back() { return array_.back(); }
+  size_t space() const { return array_.size(); }
+  void reserve(size_t size) {
+    // Naive warning to tell user this array might be to large. The memory and
+    // buffer used by this TensorArray will not be deleted during the training
+    // and inference phase, so attention not to make it expand too long.
+    if (size > 800UL) {
+      LOG(WARNING) << "TensorArray has more than 800 items";
+    }
+    array_.reserve(size);
+  }
+  bool empty() const { return size_ == 0UL; }
+  void clear() { size_ = 0UL; }
+  LoDTensor& operator[](size_t id) { return array_[id]; }
+  const LoDTensor& operator[](size_t id) const { return array_[id]; }
+  LoDTensor& at(size_t id) { return array_.at(id); }
+  const LoDTensor& at(size_t id) const { return array_.at(id); }
+  size_t size() const { return size_; }
+ private:
+  size_t size_{0};
+  std::vector<LoDTensor> array_;
+};
+#endif  // !PADDLE_ON_INFERENCE
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
    this->reserve(this->size() + size_t(end - begin));
    this->insert(this->end(), begin, end);
  }
+  const T *CUDAData(platform::Place place) const {
+    PADDLE_THROW(
+        "Vector::CUDAData() method is not supported in CPU-only version");
+  }
+  T *CUDAMutableData(platform::Place place) {
+    PADDLE_THROW(
+        "Vector::CUDAMutableData() method is not supported in CPU-only "
+        "version");
+  }
+  const T *Data(platform::Place place) const {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::Data() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+  T *MutableData(platform::Place place) {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::MutableData() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+  const void *Handle() const { return static_cast<const void *>(this); }
 };
 template <typename T>

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope>>();
+    var->GetMutable<std::vector<framework::Scope *>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
@@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
  ops_.swap(ops);
 }
-void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
-  for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
-    auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
-    for (auto *op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-#else
-  LOG(WARNING)
-      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
-#endif
-}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -48,8 +48,6 @@ class NaiveExecutor {
  void CleanFeedFetchOps();
-  void EnableMKLDNN(const ProgramDesc& program);
 protected:
  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -50,6 +50,27 @@ class CompileTimeInferShapeContext : public InferShapeContext {
  const std::vector<std::string> &Outputs(
      const std::string &name) const override;
+  void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string &input_n = Inputs(in)[i];
+    const std::string &output_n = Outputs(out)[j];
+    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
+                   in, i);
+    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
+    auto *in_var = block_.FindVarRecursive(input_n);
+    auto *out_var = block_.FindVarRecursive(output_n);
+    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
+                   "The type of %s and %s is not the same.", input_n, output_n);
+    SetDim(output_n, GetDim(input_n));
+  }
  void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                size_t j = 0) const override {
    PADDLE_ENFORCE_LT(i, Inputs(in).size());
@@ -64,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
      VLOG(3) << "input " << in << " is not LodTensor";
      return;
    }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
-                      "The %d-th output of Output(%s) must be LoDTensor.", j,
-                      out);
    out_var->SetLoDLevel(in_var->GetLoDLevel());
  }
@@ -498,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
 }
 void OpDesc::InferVarType(BlockDesc *block) const {
+  // There are a few places that var type can be set.
+  // When VarDesc is created, default set to LOD_TENSOR.
+  // When output variable is created, default is defaut set to LOD_TENSOR.
+  // We limit here to be the only place that operator defines its customized
+  // var type inference. Hence, we don't do any "default" setting here.
  auto &info = OpInfoMap::Instance().Get(this->Type());
  if (info.infer_var_type_) {
    info.infer_var_type_(*this, block);
-  } else {
-    // all output type is LoDTensor by default
-    VLOG(10) << this->Type()
-             << " has not registered InferVarType. Set output variables to "
-                "LOD_TENSOR";
-    for (auto &out_pair : this->outputs_) {
-      for (auto &out_var_name : out_pair.second) {
-        block->FindRecursiveOrCreateVar(out_var_name)
-            .SetType(proto::VarType::LOD_TENSOR);
-      }
-    }
  }
 }

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -100,16 +100,6 @@ class OpDesc {
  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
-  void SetInputMap(const VariableNameMap &input) {
-    this->inputs_ = input;
-    this->need_update_ = true;
-  }
-  void SetOutputMap(const VariableNameMap &output) {
-    this->outputs_ = output;
-    this->need_update_ = true;
-  }
  const VariableNameMap &Inputs() const { return inputs_; }
  const VariableNameMap &Outputs() const { return outputs_; }

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -21,7 +21,6 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
  validated_ = true;
  CheckNoDuplicatedInOutAttrs();
-  CheckReuseVars();
 }
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -40,40 +39,6 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
  return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
-void OpProtoAndCheckerMaker::Reuse(const std::string& name,
-                                   const std::string& reused_name) {
-  bool found = false;
-  proto::OpProto::Var* var;
-  for (auto& var : proto_->inputs()) {
-    if (var.name() == reused_name) {
-      found = true;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-  found = false;
-  for (int i = 0; i < proto_->outputs().size(); ++i) {
-    var = proto_->mutable_outputs()->Mutable(i);
-    if (var->name() == name) {
-      PADDLE_ENFORCE(!var->has_reuse(),
-                     "Output(%s) has been set reused var of %s", name,
-                     var->reuse());
-      found = true;
-      var->set_reuse(reused_name);
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-}
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
  std::unordered_set<std::string> names;
  auto checker = [&](const std::string& name) {
@@ -91,24 +56,6 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
  }
 }
-void OpProtoAndCheckerMaker::CheckReuseVars() {
-  std::unordered_set<std::string> names;
-  for (auto& input : proto_->inputs()) {
-    names.insert(input.name());
-  }
-  auto checker = [&](const std::string& name, const std::string& reused) {
-    PADDLE_ENFORCE(
-        names.count(reused),
-        "Output [%s] reuse Input [%s], but the input is not registered.", name,
-        reused);
-  };
-  for (auto& output : proto_->outputs()) {
-    if (output.has_reuse()) {
-      checker(output.name(), output.reuse());
-    }
-  }
-}
 void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                        OpAttrChecker* attr_checker) {
  proto_ = proto;
@@ -124,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kNotSpecified)})
      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),

--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,25 +14,26 @@ limitations under the License. */
 #pragma once
 #include <string>
-#include <unordered_set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace framework {
+//////////////////////////
+// Don't add more roles to make this too complicated!
+//////////////////////////
 enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,
  kOptimize = 0x0002,
  // RPC role is for send/recv releated op
-  kRPC = 0x0003,
+  kRPC = 0x0004,
  // Dist role is for split_byref/split_selected_rows/concat
  // used for distributed training.
-  kDist = 0x0004,
+  kDist = 0x0008,
  // Tag all learning rate scheduler operators.
-  kLRSched = 0x0005,
+  kLRSched = 0x0016,
  kLoss = 0x0100,
  // The default value of op's role. This should be only used for unittests and
@@ -73,11 +74,6 @@ class OpProtoAndCheckerMaker {
      var_->set_dispensable(true);
      return *this;
    }
-    VariableBuilder &Reuse(const std::string &name) {
-      var_->set_reuse(name);
-      return *this;
-    }
  };
  VariableBuilder AddInput(const std::string &name, const std::string &comment);
@@ -85,8 +81,6 @@ class OpProtoAndCheckerMaker {
  VariableBuilder AddOutput(const std::string &name,
                            const std::string &comment);
-  void Reuse(const std::string &name, const std::string &reused_name);
  template <typename T>
  TypedAttrChecker<T> &AddAttr(const std::string &name,
                               const std::string &comment,
@@ -105,8 +99,6 @@ class OpProtoAndCheckerMaker {
  void CheckNoDuplicatedInOutAttrs();
  void Validate();
-  void CheckReuseVars();
  proto::OpProto *proto_;
  OpAttrChecker *op_checker_;
  bool validated_{false};

--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -47,120 +47,3 @@ TEST(ProtoMaker, DuplicatedInOut) {
  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
               paddle::platform::EnforceNotMet);
 }
-class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-  }
-};
-class TestInplaceProtoMaker2
-    : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-    AddOutput("NoOut", "output of test op").Reuse("NotExists");
-  }
-};
-TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto, op_proto2;
-  paddle::framework::OpAttrChecker op_checker;
-  TestInplaceProtoMaker proto_maker;
-  TestInplaceProtoMaker2 proto_maker2;
-  proto_maker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-// normal reuse
-class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddInput("Y", "input of test op");
-    AddOutput("Out", "output of test op");
-    AddOutput("XOut", "output of test op");
-    // avoid destructor exception.
-    // Validate();
-    TestReuse();
-  }
-  virtual void TestReuse() {}
-};
-// test duplicate reuse error
-class TestReuseProtoMaker2 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "X");
-    Reuse("Out", "Y");
-  }
-};
-// NotExists Input
-class TestReuseProtoMaker3 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "NotExists");
-    Reuse("XOut", "X");
-  }
-};
-// NotExists Output
-class TestReuseProtoMaker4 : public TestReuseProtoMaker {
- public:
-  void TestReuse() { Reuse("NotExists", "X"); }
-};
-TEST(ProtoMaker, Reuse) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker proto_maker;
-  proto_maker(&op_proto, &op_checker);
-}
-// NOTE(dzhwinter):
-// There is a Fatal CHECK on base class destructor, which will call abort inside
-// instead of
-// throw an exception. If we throw an exception in Make(), we will trigger the
-// CHECK and terminate the tests.
-//
-// I had tried to replace the default CHECK with a exception, however, it's
-// still not supported by glog.
-// the details:
-// https://github.com/google/glog/issues/249
-// https://github.com/facebookresearch/TensorComprehensions/issues/351
-/*
-TEST(ProtoMaker, ReuseWithException) {
-  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker2 proto_maker2;
-  TestReuseProtoMaker3 proto_maker3;
-  TestReuseProtoMaker4 proto_maker4;
-  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-void FailureFunction() {
-  throw std::runtime_error("Check failed in destructor.");
-  // return 0;
-}
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  google::InstallFailureFunction(&FailureFunction);
-  return RUN_ALL_TESTS();
-}
-*/
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
    platform::SetDeviceId(dev_id);
 #endif
  }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
+  // The profile has a process-wide mutex, results in serious performance issue
-  RunImpl(scope, place);
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
+  if (platform::IsProfileEnabled()) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+    RunImpl(scope, place);
+  } else {
+    RunImpl(scope, place);
+  }
  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
@@ -542,6 +550,36 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return op_.Outputs(name);
  }
+  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string& input_n = Inputs(in)[i];
+    const std::string& output_n = Outputs(out)[j];
+    Variable* in_var = scope_.FindVar(input_n);
+    Variable* out_var = scope_.FindVar(output_n);
+    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
+                   "The type of %s and %s is not the same.", output_n,
+                   GetDim(input_n));
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
+      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
+      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
+      out_sele_rows->set_rows(in_sele_rows.rows());
+      out_sele_rows->set_height(in_sele_rows.height());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
+      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
+      out_lod_tensor->Resize(in_lod_tensor.dims());
+    } else {
+      PADDLE_THROW(
+          "Currently, the input type of ShareDim only can be LoDTensor "
+          "or SelectedRows.");
+    }
+  }
  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) const override {
    const std::vector<std::string>& inputs = Inputs(in);

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -156,12 +156,10 @@ ParallelExecutor::ParallelExecutor(
                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
-  if (VLOG_IS_ON(5)) {
+  // If the loss_var_name is given, the number of graph should be only one.
-    // If the loss_var_name is given, the number of graph should be only one.
+  if (loss_var_name.size()) {
-    if (loss_var_name.size()) {
+    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-      PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                      "The number of graph should be only one");
-                        "The number of graph should be only one");
-    }
  }
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
@@ -299,6 +297,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 ParallelExecutor::~ParallelExecutor() {
+  const auto dev_ctxs =
+      platform::DeviceContextPool::Instance().GetAllDeviceContexts();
+  for (auto &dev_ctx : dev_ctxs) {
+    dev_ctx->Wait();
+  }
  if (member_->own_local_scope_) {
    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
      Scope *local_scope = member_->local_scopes_[i];
@@ -307,6 +311,10 @@ ParallelExecutor::~ParallelExecutor() {
      }
    }
  }
+  // member_ must be destructed before gcs_ since the destructor of
+  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
+  member_.reset();
 }
 }  // namespace framework

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -75,7 +75,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
-  ParallelExecutorPrivate *member_;
+  std::unique_ptr<ParallelExecutorPrivate> member_;
 #ifdef PADDLE_WITH_CUDA
  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then

--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -126,7 +126,7 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
  std::vector<std::string> feed_target_names;
  for (auto *op : global_block.AllOps()) {
    if (op->Type() == kFeedOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
      if (col >= feed_target_names.size()) {
        feed_target_names.resize(col + 1);
      }
@@ -143,7 +143,7 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
  std::vector<std::string> fetch_target_names;
  for (auto *op : global_block.AllOps()) {
    if (op->Type() == kFetchOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
      if (col >= fetch_target_names.size()) {
        fetch_target_names.resize(col + 1);
      }

--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) {
      ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
      found_sub_block = true;
-      ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
+      ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size());
      found_sub_blocks = true;
    }
  }

--- a/paddle/fluid/framework/reader_test.cc
+++ b/paddle/fluid/framework/reader_test.cc
@@ -39,8 +39,8 @@ TEST(READER, decorate_chain) {
  {
    auto endpoints = root->GetEndPoints();
    ASSERT_EQ(endpoints.size(), 2U);
-    ASSERT_NE(endpoints.count(end_point1.get()), 0);
+    ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
-    ASSERT_NE(endpoints.count(end_point2.get()), 0);
+    ASSERT_NE(endpoints.count(end_point2.get()), 0UL);
  }
  {

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 Scope& Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }
 Variable* Scope::Var(const std::string& name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return VarInternal(name);
 }
 Variable* Scope::Var(std::string* name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }
+Variable* Scope::FindLocalVar(const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return FindVarLocally(name);
+}
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindScopeInternal(var);
 }
 void Scope::DropKids() {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
 bool Scope::HasKid(const Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 void Scope::DeleteScope(Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  RenameInternal(origin_name, new_name);
 }
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,6 +63,11 @@ class Scope {
  /// Caller doesn't own the returned Variable.
  Variable* FindVar(const std::string& name) const;
+  /// Find a variable in the current scope.
+  /// Return nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  Variable* FindLocalVar(const std::string& name) const;
  const Scope* parent() const { return parent_; }
  /// Find the scope or an ancestor scope that contains the given variable.
@@ -73,6 +78,8 @@ class Scope {
  /// Drop all kids scopes belonged to this scope.
  void DropKids();
+  std::list<Scope*>& kids() const { return kids_; }
  /// Find if a scope exists in the kid scopes
  bool HasKid(const Scope* scope) const;

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) {
  ASSERT_TRUE(table.HasKey(10));
  ASSERT_TRUE(table.HasKey(8));
  ASSERT_TRUE(table.HasKey(6));
-  ASSERT_EQ(table.rows().size(), 3);
+  ASSERT_EQ(table.rows().size(), 3UL);
  framework::Tensor ids;
  ids.Resize(framework::make_ddim({4}));

--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -56,6 +56,9 @@ class InferShapeContext {
  virtual const std::vector<std::string> &Outputs(
      const std::string &name) const = 0;
+  virtual void ShareDim(const std::string &in, const std::string &out,
+                        size_t i = 0, size_t j = 0) = 0;
  virtual void ShareLoD(const std::string &in, const std::string &out,
                        size_t i = 0, size_t j = 0) const = 0;

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
  }
@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
+      if (src_ptr == dst_ptr) {
+        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+                << dst_place;
+        return;
+      }
      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                   stream);
    } else {
@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  auto dst_ptr = dst->mutable_data(dst_place, src.type());
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
  }
@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
+    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
@@ -165,10 +185,12 @@ inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
 }
 template <typename Predicate>
-struct AnyVisitor : public boost::static_visitor<bool> {
+class AnyVisitor : public boost::static_visitor<bool> {
+ private:
  const framework::Tensor& tensor_;
  Predicate predicate_;
+ public:
  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
      : tensor_(tensor), predicate_(std::move(predicate)) {}
@@ -206,6 +228,27 @@ struct AnyVisitor : public boost::static_visitor<bool> {
  }
 };
+template <typename Predicate>
+class AnyOutVisitor : public boost::static_visitor<> {
+ private:
+  const framework::Tensor& tensor_;
+  mutable framework::Tensor* out_;
+  Predicate predicate_;
+ public:
+  AnyOutVisitor(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out)
+      : tensor_(tensor), out_(out), predicate_(std::move(predicate)) {}
+  template <typename Place>
+  void operator()(const Place& place) const {
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    out_->Resize({1});
+    out_->mutable_data<bool>(place);
+    AnyImpl(predicate_, tensor_, *ctx, out_);
+  }
+};
 template <typename Predicate>
 inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
  AnyVisitor<Predicate> visitor(tensor, predicate);
@@ -213,6 +256,14 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
  return platform::VisitPlace(place, visitor);
 }
+template <typename Predicate>
+inline void Any(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out) {
+  AnyOutVisitor<Predicate> visitor(tensor, predicate, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
 struct ContainsNANPredicate {
  template <typename T>
  auto operator()(const T& eigen_vec) const
@@ -227,6 +278,12 @@ bool TensorContainsNAN(const framework::Tensor& tensor) {
  return Any(tensor, predicate);
 }
+void TensorContainsNAN(const framework::Tensor& tensor,
+                       framework::Tensor* out) {
+  ContainsNANPredicate predicate;
+  Any(tensor, predicate, out);
+}
 struct ContainsInfPredicate {
  template <typename T>
  auto operator()(const T& eigen_vec) const
@@ -241,6 +298,71 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
  return Any(tensor, predicate);
 }
+void TensorContainsInf(const framework::Tensor& tensor,
+                       framework::Tensor* out) {
+  ContainsInfPredicate predicate;
+  Any(tensor, predicate, out);
+}
+// NOTE(dzhwinter):
+// Isfinite need a AllVisitor to loop through all the elements.
+// We choose two cuda call instead of one allvisitor. The AllVisitor
+// should be implemented if the performance hurts.
+bool TensorIsfinite(const framework::Tensor& tensor) {
+  ContainsInfPredicate pred_inf;
+  ContainsNANPredicate pred_nan;
+  return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
+}
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+static inline void __global__ BothFalse(const T* cmp, T* out) {
+  out[0] = (!cmp[0]) && (!out[0]);
+}
+#endif
+struct BothFalseVisitor : public boost::static_visitor<> {
+  const framework::Tensor& in_;
+  mutable framework::Tensor* out_;
+  BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out)
+      : in_(in), out_(out) {}
+  template <typename Place>
+  void operator()(const Place& place) const {
+    VisitorImpl(place);
+  }
+  void VisitorImpl(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
+    BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
+                                                out_->mutable_data<bool>(gpu));
+#endif
+  }
+  void VisitorImpl(const platform::CPUPlace& cpu) const {
+    bool lhs = !in_.data<bool>()[0];
+    bool rhs = !out_->mutable_data<bool>(cpu)[0];
+    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+  }
+  void VisitorImpl(
+      const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
+    bool lhs = !in_.data<bool>()[0];
+    bool rhs = !out_->mutable_data<bool>(cpu)[0];
+    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+  }
+};
+void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
+  framework::Tensor tmp;
+  TensorContainsInf(tensor, &tmp);
+  TensorContainsNAN(tensor, out);
+  BothFalseVisitor visitor(tmp, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                    const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version

--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -57,8 +57,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
 template <typename T>
 void TesnorToVector(const Tensor& src, std::vector<T>* dst);
+// copy the result bool to cpu
 bool TensorContainsNAN(const framework::Tensor& tensor);
 bool TensorContainsInf(const framework::Tensor& tensor);
+bool TensorIsfinite(const framework::Tensor& tensor);
+// store the result bool in gpu tensor, async operation. Faster than above ones.
+void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
+void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
+void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                    const platform::DeviceContext& dev_ctx);

--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -36,7 +36,12 @@ TEST(TensorCopy, Tensor) {
  TensorCopy(src_tensor, *cpu_place, &dst_tensor);
  const int* dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(src_ptr, dst_ptr);
+  EXPECT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+  TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
  for (size_t i = 0; i < 9; ++i) {
    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
  }
@@ -47,7 +52,7 @@ TEST(TensorCopy, Tensor) {
  TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
  const int* slice_ptr = slice_tensor.data<int>();
  dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(dst_ptr, slice_ptr);
+  EXPECT_NE(dst_ptr, slice_ptr);
  for (size_t i = 0; i < 3; ++i) {
    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
  }
@@ -77,11 +82,20 @@ TEST(TensorCopy, Tensor) {
    // Sync before Compare Tensors
    gpu_ctx.Wait();
    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
    for (size_t i = 0; i < 9; ++i) {
      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
    }
+    // Copy the same tensor
+    TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    gpu_ctx.Wait();
+    const int* dst_ptr_tmp = dst_tensor.data<int>();
+    EXPECT_NE(src_ptr, dst_ptr_tmp);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
+    }
    Tensor slice_tensor = src_tensor.Slice(1, 2);
    // CPU Slice Tensor to GPU Tensor
@@ -94,7 +108,7 @@ TEST(TensorCopy, Tensor) {
    gpu_ctx.Wait();
    const int* slice_ptr = slice_tensor.data<int>();
    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
+    EXPECT_NE(dst_ptr, slice_ptr);
    for (size_t i = 0; i < 3; ++i) {
      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
    }
@@ -117,7 +131,7 @@ TEST(TensorFromVector, Tensor) {
    // Compare Tensors
    const int* cpu_ptr = cpu_tensor.data<int>();
    const int* src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
    for (size_t i = 0; i < 9; ++i) {
      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
    }
@@ -127,7 +141,7 @@ TEST(TensorFromVector, Tensor) {
    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
    cpu_ptr = cpu_tensor.data<int>();
    src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
    for (size_t i = 0; i < 5; ++i) {
      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
    }
@@ -161,8 +175,8 @@ TEST(TensorFromVector, Tensor) {
    const int* src_ptr = src_vec.data();
    const int* cpu_ptr = cpu_tensor.data<int>();
    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
    for (size_t i = 0; i < 9; ++i) {
      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
@@ -181,8 +195,8 @@ TEST(TensorFromVector, Tensor) {
    src_ptr = src_vec.data();
    cpu_ptr = cpu_tensor.data<int>();
    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
    for (size_t i = 0; i < 5; ++i) {
      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
@@ -235,9 +249,9 @@ TEST(TensorContainsNAN, CPU) {
    buf[0] = 0.0;
    buf[1] = NAN;
    buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
    buf[1] = 0.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
  }
  {
@@ -248,9 +262,9 @@ TEST(TensorContainsNAN, CPU) {
    buf[0] = 0.0;
    buf[1].x = 0x7fff;
    buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
    buf[1] = 0.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
  }
 }
@@ -261,9 +275,9 @@ TEST(TensorContainsInf, CPU) {
    buf[0] = 1.0;
    buf[1] = INFINITY;
    buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
    buf[1] = 1.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
  }
  {
@@ -274,9 +288,55 @@ TEST(TensorContainsInf, CPU) {
    buf[0] = 1.0;
    buf[1].x = 0x7c00;
    buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
+    buf[1] = 1.0;
+    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
+  }
+}
+TEST(TensorIsfinite, CPU) {
+  {
+    paddle::framework::Tensor src, out;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1] = INFINITY;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
+    buf[1] = 1.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+  }
+  {
+    paddle::framework::Tensor src, out;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1] = NAN;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
+    buf[1] = 1.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+  }
+  {
+    paddle::framework::Tensor src, out;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1].x = 0x7c00;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
    buf[1] = 1.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+    buf[1].x = 0x7fff;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
  }
 }
@@ -299,9 +359,9 @@ TEST(Tensor, FromAndToStream) {
    TensorFromStream(iss, &dst_tensor, cpu_ctx);
    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 5; ++i) {
-      ASSERT_EQ(dst_ptr[i], array[i]);
+      EXPECT_EQ(dst_ptr[i], array[i]);
    }
-    ASSERT_EQ(dst_tensor.dims(), src_tensor.dims());
+    EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
    delete place;
  }
 #ifdef PADDLE_WITH_CUDA
@@ -323,7 +383,7 @@ TEST(Tensor, FromAndToStream) {
    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 6; ++i) {
-      ASSERT_EQ(dst_ptr[i], array[i]);
+      EXPECT_EQ(dst_ptr[i], array[i]);
    }
    delete gpu_place;
  }

--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -27,9 +27,9 @@ static __global__ void FillNAN(float* buf) {
 }
 static __global__ void FillInf(float* buf) {
-  buf[0] = 0.0;
+  buf[0] = INFINITY;
-  buf[1] = INFINITY;
+  buf[1] = 0.1;
-  buf[2] = 0.5;
+  buf[2] = 0.2;
 }
 static __global__ void FillNAN(platform::float16* buf) {
@@ -44,6 +44,18 @@ static __global__ void FillInf(platform::float16* buf) {
  buf[2] = 0.5;
 }
+static __global__ void FillFinite(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = 0.2;
+}
+static __global__ void FillFinite(platform::float16* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = 0.2;
+}
 TEST(TensorContainsNAN, GPU) {
  paddle::platform::CUDAPlace gpu(0);
  auto& pool = paddle::platform::DeviceContextPool::Instance();
@@ -86,5 +98,163 @@ TEST(TensorContainsInf, GPU) {
  }
 }
+TEST(TensorIsfinite, GPU) {
+  paddle::platform::CUDAPlace gpu(0);
+  using paddle::platform::float16;
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  // contains inf
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  // contains nan
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  // all element are finite
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(TensorIsfinite(tensor));
+  }
+}
+TEST(TensorContainsInf, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsInf(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+  {
+    Tensor tensor, out;
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsInf(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+}
+TEST(TensorContainsNAN, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsNAN(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+  {
+    Tensor tensor, out;
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsNAN(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+}
+TEST(TensorIsfinite, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], false);
+  }
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], false);
+  }
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], true);
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,
 namespace paddle {
 namespace framework {
 std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
 std::once_flag ThreadPool::init_flag_;
@@ -47,8 +46,7 @@ void ThreadPool::Init() {
  }
 }
-ThreadPool::ThreadPool(int num_threads)
+ThreadPool::ThreadPool(int num_threads) : running_(true) {
-    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
  threads_.resize(num_threads);
  for (auto& thread : threads_) {
    // TODO(Yancey1989): binding the thread on the specify CPU number
@@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
 ThreadPool::~ThreadPool() {
  {
    // notify all threads to stop running
+    std::lock_guard<std::mutex> l(mutex_);
    running_ = false;
    scheduled_.notify_all();
  }
@@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
  }
 }
-void ThreadPool::Wait() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  completed_.wait(lock, [=] { return Done() == true; });
-}
 void ThreadPool::TaskLoop() {
-  while (running_) {
+  while (true) {
    std::unique_lock<std::mutex> lock(mutex_);
-    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
-    if (!running_) {
+    scheduled_.wait(
-      break;
+        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+    if (!running_ || tasks_.empty()) {
+      return;
    }
    // pop a task from the task queue
    auto task = std::move(tasks_.front());
    tasks_.pop();
-    --idle_threads_;
    lock.unlock();
    // run the task
    task();
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      ++idle_threads_;
-      if (Done()) {
-        completed_.notify_all();
-      }
-    }
  }
 }

--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -57,15 +57,6 @@ class ThreadPool {
  ~ThreadPool();
-  // Returns the number of threads created by the constructor.
-  size_t Threads() const { return total_threads_; }
-  // Returns the number of currently idle threads.
-  size_t IdleThreads() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    return idle_threads_;
-  }
  // Run pushes a function to the task queue and returns a std::future
  // object.  To wait for the completion of the task, call
  // std::future::wait().
@@ -94,25 +85,13 @@ class ThreadPool {
    });
    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
    tasks_.push(std::move(task));
-    lock.unlock();
    scheduled_.notify_one();
    return f;
  }
-  // Wait until all the tasks are completed.
-  void Wait();
 private:
  DISABLE_COPY_AND_ASSIGN(ThreadPool);
-  // If the task queue is empty and avaialbe is equal to the number of
-  // threads, means that all tasks are completed.  Note: this function
-  // is not thread-safe.  Returns true if all tasks are completed.
-  // Note: don't delete the data member total_threads_ and use
-  // threads_.size() instead; because you'd need to lock the mutex
-  // before accessing threads_.
-  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
  // The constructor starts threads to run TaskLoop, which retrieves
  // and runs tasks from the queue.
  void TaskLoop();
@@ -125,14 +104,11 @@ class ThreadPool {
  static std::once_flag init_flag_;
  std::vector<std::unique_ptr<std::thread>> threads_;
-  const size_t total_threads_;
-  size_t idle_threads_;
  std::queue<Task> tasks_;
  std::mutex mutex_;
  bool running_;
  std::condition_variable scheduled_;
-  std::condition_variable completed_;
 };
 class ThreadPoolIO : ThreadPool {

--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -19,10 +19,11 @@ limitations under the License. */
 namespace framework = paddle::framework;
-void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
+void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
-  std::vector<std::future<void>> fs;
+            std::atomic<int>* sum, int cnt) {
  for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
+    std::lock_guard<std::mutex> l(*mu);
+    fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
  }
 }
@@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
 }
 TEST(ThreadPool, ConcurrentRun) {
-  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
  std::atomic<int> sum(0);
  std::vector<std::thread> threads;
+  std::vector<std::future<void>> fs;
+  std::mutex fs_mu;
  int n = 50;
  // sum = (n * (n + 1)) / 2
  for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, &sum, i);
+    std::thread t(do_sum, &fs, &fs_mu, &sum, i);
    threads.push_back(std::move(t));
  }
  for (auto& t : threads) {
    t.join();
  }
-  pool->Wait();
+  for (auto& t : fs) {
+    t.wait();
+  }
  EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -59,6 +59,7 @@ class VarDesc {
 public:
  explicit VarDesc(const std::string &name) {
    desc_.set_name(name);
+    // TODO(paddle-dev): Why default to lodtensor.
    desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
  }

--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -38,8 +38,12 @@ class Variable {
  template <typename T>
  T* GetMutable() {
-    if (!IsType<T>()) {
+    if (!holder_) {
      holder_.reset(new PlaceholderImpl<T>(new T()));
+    } else {
+      PADDLE_ENFORCE(IsType<T>(),
+                     "Variable must be type %s, the holding type is %s",
+                     typeid(T).name(), holder_->Type().name());
    }
    return static_cast<T*>(holder_->Ptr());
  }

--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
  const Tensor& tt = v->Get<Tensor>();
  EXPECT_EQ(1234, tt.content_);
-  std::string* s = v->GetMutable<std::string>();
+  try {
-  *s = "hello";
+    v->GetMutable<std::string>();
+  } catch (std::exception& e) {
-  const std::string& ss = v->Get<std::string>();
+    return;
-  EXPECT_EQ("hello", ss);
+  }
+  EXPECT_TRUE(false);
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -19,9 +19,19 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 add_subdirectory(api)
+set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
+set(SHARED_INFERENCE_SRCS
+    io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
+if (WITH_GPU AND TENSORRT_FOUND)
+  set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine)
+  set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc)
+endif()
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api 
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
-           analysis_predictor zero_copy_tensor)
 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -29,11 +39,8 @@ if(NOT APPLE)
 endif()
 # Create shared library
-cc_library(paddle_fluid_shared SHARED
+cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
-    DEPS ${fluid_modules} paddle_fluid_api)
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
            {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-             "depthwise_conv2d", "batch_norm", "concat", "tanh",
+             "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
             "elementwise_add", "dropout"});
        if (!node->IsFunction()) return false;
@@ -101,7 +101,16 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
 void Analyzer::Run(Argument* argument) {
  std::vector<std::string> passes;
-  for (auto& pass : all_ir_passes_) {
+#ifdef PADDLE_WITH_MKLDNN
+  if (use_mkldnn_) {
+    VLOG(3) << "Adding MKL-DNN placement pass";
+    passes.push_back("mkldnn_placement_pass");
+  }
+#endif
+  // infer_clean_graph_pass should be the first default pass
+  // after mkldnn_placement_pass.
+  passes.push_back("infer_clean_graph_pass");
+  for (auto& pass : ir_passes_) {
    if (!disabled_ir_passes_.count(pass)) {
      passes.push_back(pass);
      passes.push_back("graph_viz_pass");  // add graphviz for debug.
@@ -117,11 +126,26 @@ void Analyzer::Run(Argument* argument) {
  }
 }
+Analyzer& Analyzer::IncludeAllIrPasses() {
+  ir_passes_ = all_ir_passes_;
+  return *this;
+}
 Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
  disabled_ir_passes_.insert(passes.begin(), passes.end());
  return *this;
 }
+Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
+  ir_passes_ = passes;
+  return *this;
+}
+Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
+  use_mkldnn_ = use_mkldnn;
+  return *this;
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
  void Run(Argument* argument);
  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeAllIrPasses();
+  Analyzer& SetUseMkldnn(bool use_mkldnn);
  DISABLE_COPY_AND_ASSIGN(Analyzer);
@@ -64,21 +67,28 @@ class Analyzer : public OrderedRegistry<PassManager> {
  // larger fusion.
  const std::vector<std::string> all_ir_passes_{{
      // Manual update the passes here.
-      "infer_clean_graph_pass",       //
+      "attention_lstm_fuse_pass",       //
-      "attention_lstm_fuse_pass",     //
+      "seqconv_eltadd_relu_fuse_pass",  //
-      "embedding_fc_lstm_fuse_pass",  //
+      "embedding_fc_lstm_fuse_pass",    //
-      "fc_lstm_fuse_pass",            //
+      "fc_lstm_fuse_pass",              //
-      "mul_lstm_fuse_pass",           //
+      "mul_lstm_fuse_pass",             //
-      "fc_gru_fuse_pass",             //
+      "fc_gru_fuse_pass",               //
-      "mul_gru_fuse_pass",            //
+      "mul_gru_fuse_pass",              //
-      "seq_concat_fc_fuse_pass",      //
+      "seq_concat_fc_fuse_pass",        //
-      "fc_fuse_pass",                 //
+      "fc_fuse_pass",                   //
+      "conv_bn_fuse_pass",              //
+      "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
-      "conv_relu_mkldnn_fuse_pass",  //
+      "conv_bias_mkldnn_fuse_pass",             //
+      "conv_relu_mkldnn_fuse_pass",             //
+      "conv_elementwise_add_mkldnn_fuse_pass",  //
 #endif
  }};
  std::unordered_set<std::string> disabled_ir_passes_;
+  // Ir passes to run
+  std::vector<std::string> ir_passes_;
+  bool use_mkldnn_;
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
  config.model_dir = model_path;
  config.use_gpu = false;
  config.device = 0;
-  auto predictor =
+  auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
-      ::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-          config);
  // One single batch

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,7 +18,8 @@ if(APPLE)
 endif(APPLE)
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
+        )
 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
@@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME)
        set(multiValueArgs ARGS)
        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        cc_test(${TARGET_NAME}
+	if (WITH_GPU)
-                SRCS ${inference_test_SRC}
+		cc_test(${TARGET_NAME}
-                DEPS "${inference_deps}"
+			SRCS ${inference_test_SRC}
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
+        else()
+		cc_test(${TARGET_NAME}
+			SRCS ${inference_test_SRC}
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+	endif()
        if(inference_test_ARGS)
            set_tests_properties(${TARGET_NAME}
                    PROPERTIES DEPENDS "${inference_test_ARGS}")
@@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME)
    endif(WITH_TESTING)
 endfunction(inference_api_test)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -25,9 +25,11 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
  }
 #endif
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
@@ -72,15 +77,12 @@ bool AnalysisPredictor::Init(
    inference_program_ = program;
  }
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
-  }
  executor_->Prepare(scope_.get(), *inference_program_, 0,
                     config_.use_feed_fetch_ops);
  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
  return true;
 }
@@ -108,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }
@@ -220,10 +226,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
-  PADDLE_ENFORCE(
-      config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude,
+  switch (config_.ir_mode) {
-      "Only kExclude is supported yet.");
+    case contrib::AnalysisConfig::IrPassMode::kExclude:
-  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+      Analyzer()
+          .IncludeAllIrPasses()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .DisableIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    case contrib::AnalysisConfig::IrPassMode::kInclude:
+      Analyzer()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .IncludeIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    default:
+      LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet.";
+  }
  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
@@ -307,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 bool AnalysisPredictor::ZeroCopyRun() {
  executor_->Run();
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }
@@ -335,6 +358,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
  }
  return true;
 }
+AnalysisPredictor::~AnalysisPredictor() {
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    platform::DisableProfiler(platform::EventSortingKey::kTotal,
+                              "./profile.log");
+  }
+#endif
+  if (sub_scope_) {
+    scope_->DeleteScope(sub_scope_);
+  }
+}
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  auto *x = new AnalysisPredictor(config_);
  x->Init(scope_, inference_program_);

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
@@ -72,6 +73,7 @@ class AnalysisPredictor : public PaddlePredictor {
  template <typename T>
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);
+  ~AnalysisPredictor();
 private:
  contrib::AnalysisConfig config_;
@@ -87,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  config.use_feed_fetch_ops = false;
-  auto predictor =
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
  auto w0 = predictor->GetInputTensor("firstw");
  auto w1 = predictor->GetInputTensor("secondw");

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,10 +22,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace {
@@ -72,6 +75,9 @@ bool NativePaddlePredictor::Init(
  }
 #endif
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else {
@@ -152,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
  std::vector<framework::OpDesc *> fetchs_;
  // Do not use unique_ptr, use parent scope to delete
  framework::Scope *sub_scope_{nullptr};
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 }  // namespace paddle
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -205,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
+        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
      }
    });
  }

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
 USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
+USE_TRT_CONVERTER(pad);
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
  config1.device = 0;
  config1.max_batch_size = 10;
-  auto predictor0 =
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
-  auto predictor1 =
-      CreatePaddlePredictor<MixedRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
  for (int batch_id = 0; batch_id < 1; batch_id++) {
    //# 2. Prepare input.

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -3,6 +3,7 @@ project(cpp_inference_demo CXX C)
 option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
 option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
 option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
+option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
 macro(safe_set_static_flag)
    foreach(flag_var
@@ -51,6 +52,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
@@ -60,6 +62,13 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
+      include_directories("${TENSORRT_INCLUDE_DIR}")
+      link_directories("${TENSORRT_LIB_DIR}")
+  endif()
+endif(NOT WIN32)
 if (NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
 link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
@@ -69,13 +78,14 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/paddle/fluid/inference")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 if(WITH_MKL)
  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
  if(EXISTS ${MKLDNN_PATH})
@@ -89,17 +99,17 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
    ${MATH_LIB} ${MKLDNN_LIB}
-    glog gflags protobuf snappystream snappy z
+    glog gflags protobuf snappystream snappy z xxhash
    ${EXTERNAL_LIB})
 else()
 set(DEPS ${DEPS}
@@ -112,6 +122,10 @@ endif(NOT WIN32)
 if(WITH_GPU)
  if(NOT WIN32)
+    if (USE_TENSORRT)
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )

--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -3,19 +3,28 @@ PADDLE_ROOT=$1
 TURN_ON_MKL=$2 # use MKL or Openblas
 TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
+TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include
+TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
+inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
 cd `dirname $0`
 current_dir=`pwd`
 if [ $2 == ON ]; then
  # You can export yourself if move the install path
-  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
 fi
 if [ $3 == ON ]; then
  use_gpu_list='true false'
-else    
+else
  use_gpu_list='false'
 fi
+USE_TENSORRT=OFF
+if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
+  USE_TENSORRT=ON
+fi
 PREFIX=inference-vis-demos%2F
 URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
@@ -47,11 +56,12 @@ cd build
 for WITH_STATIC_LIB in ON OFF; do
  # -----simple_on_word2vec-----
  rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
    -DWITH_MKL=$TURN_ON_MKL \
    -DDEMO_NAME=simple_on_word2vec \
    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+    -DON_INFER=ON
  make -j
  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
  if [ -d $word2vec_model ]; then
@@ -67,14 +77,15 @@ for WITH_STATIC_LIB in ON OFF; do
  fi
  # ---------vis_demo---------
  rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
    -DWITH_MKL=$TURN_ON_MKL \
    -DDEMO_NAME=vis_demo \
    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+    -DON_INFER=ON
  make -j
  for use_gpu in $use_gpu_list; do
-    for vis_demo_name in $vis_demo_list; do 
+    for vis_demo_name in $vis_demo_list; do
      ./vis_demo \
        --modeldir=$DATA_DIR/$vis_demo_name/model \
        --data=$DATA_DIR/$vis_demo_name/data.txt \
@@ -86,5 +97,24 @@ for WITH_STATIC_LIB in ON OFF; do
      fi
    done
  done
+  # --------tensorrt mobilenet------
+  if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
+    rm -rf *
+    cmake .. -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=trt_mobilenet_demo \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DUSE_TENSORRT=$USE_TENSORRT \
+      -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
+      -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR \
+      -DON_INFER=ON
+    make -j
+    ./trt_mobilenet_demo \
+      --modeldir=$DATA_DIR/mobilenet/model \
+      --data=$DATA_DIR/mobilenet/data.txt \
+      --refer=$DATA_DIR/mobilenet/result.txt 
+  fi
 done
 set +x
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <thread>  //NOLINT
-#include "paddle/fluid/inference/paddle_inference_api.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/include/paddle_inference_api.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
@@ -42,8 +42,7 @@ void Main(bool use_gpu) {
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
-  auto predictor =
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  for (int batch_id = 0; batch_id < 3; batch_id++) {
    //# 2. Prepare input.
@@ -62,17 +61,17 @@ void Main(bool use_gpu) {
    CHECK(predictor->Run(slots, &outputs));
    //# 4. Get output.
-    PADDLE_ENFORCE(outputs.size(), 1UL);
+    CHECK_EQ(outputs.size(), 1UL);
    // Check the output buffer size and result of each tid.
-    PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+    CHECK_EQ(outputs.front().data.length(), 33168UL);
    float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                       0.000932706};
    const size_t num_elements = outputs.front().data.length() / sizeof(float);
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
         i++) {
-      PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+      CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
-                     result[i]);
+                 0.001);
    }
  }
 }
@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) {
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
-  auto main_predictor =
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
@@ -108,9 +106,9 @@ void MainThreads(int num_threads, bool use_gpu) {
        CHECK(predictor->Run(inputs, &outputs));
        // 4. Get output.
-        PADDLE_ENFORCE(outputs.size(), 1UL);
+        CHECK_EQ(outputs.size(), 1UL);
        // Check the output buffer size and result of each tid.
-        PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+        CHECK_EQ(outputs.front().data.length(), 33168UL);
        float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                           0.000932706};
        const size_t num_elements =
@@ -118,8 +116,8 @@ void MainThreads(int num_threads, bool use_gpu) {
        // The outputs' buffers are in CPU memory.
        for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
             i++) {
-          PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+          CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
-                         result[i]);
+                     result[i], 0.001);
        }
      }
    });

--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+/*
+ * This file contains demo of mobilenet for tensorrt.
+ */
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include "utils.h"  // NOLINT
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+DEFINE_string(refer, "", "path to reference result for comparison.");
+DEFINE_string(
+    data, "",
+    "path of data; each line is a record, format is "
+    "'<space splitted floats as data>\t<space splitted ints as shape'");
+namespace paddle {
+namespace demo {
+/*
+ * Use the tensorrt fluid engine to inference the demo.
+ */
+void Main() {
+  std::unique_ptr<PaddlePredictor> predictor;
+  paddle::contrib::MixedRTConfig config;
+  config.param_file = FLAGS_modeldir + "/__params__";
+  config.prog_file = FLAGS_modeldir + "/__model__";
+  config.use_gpu = true;
+  config.device = 0;
+  config.max_batch_size = 1;
+  config.fraction_of_gpu_memory = 0.1;  // set by yourself
+  predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
+  VLOG(3) << "begin to process data";
+  // Just a single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+  file.close();
+  // Inference.
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
+  VLOG(3) << "run executor";
+  std::vector<PaddleTensor> output;
+  predictor->Run({input}, &output, 1);
+  VLOG(3) << "output.size " << output.size();
+  auto& tensor = output.front();
+  VLOG(3) << "output: " << SummaryTensor(tensor);
+  // compare with reference result
+  CheckOutput(FLAGS_refer, tensor);
+}
+}  // namespace demo
+}  // namespace paddle
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main();
+  return 0;
+}
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -14,13 +14,20 @@
 #pragma once
 #include <algorithm>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 namespace paddle {
 namespace demo {
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
 static void split(const std::string& str, char sep,
                  std::vector<std::string>* pieces) {
  pieces->clear();
@@ -39,6 +46,58 @@ static void split(const std::string& str, char sep,
  }
 }
+Record ProcessALine(const std::string& line) {
+  VLOG(3) << "process a line";
+  std::vector<std::string> columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+  Record record;
+  std::vector<std::string> data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto& d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+  std::vector<std::string> shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto& s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
+  return record;
+}
+void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
+  std::string line;
+  std::ifstream file(referfile);
+  std::getline(file, line);
+  auto refer = ProcessALine(line);
+  file.close();
+  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
+  VLOG(3) << "predictor output numel " << numel;
+  VLOG(3) << "reference output numel " << refer.data.size();
+  CHECK_EQ(numel, refer.data.size());
+  switch (output.dtype) {
+    case PaddleDType::INT64: {
+      for (size_t i = 0; i < numel; ++i) {
+        CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
+      }
+      break;
+    }
+    case PaddleDType::FLOAT32:
+      for (size_t i = 0; i < numel; ++i) {
+        CHECK_LT(
+            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
+            1e-5);
+      }
+      break;
+  }
+}
 /*
 * Get a summary of a PaddleTensor content.
 */

--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -17,11 +17,8 @@ limitations under the License. */
 */
 #include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include <fstream>
+#include "utils.h"  // NOLINT
-#include <iostream>
-#include "paddle/fluid/inference/demo_ci/utils.h"
-#include "paddle/fluid/platform/enforce.h"
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -37,71 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
 namespace paddle {
 namespace demo {
-struct Record {
+using contrib::AnalysisConfig;
-  std::vector<float> data;
-  std::vector<int32_t> shape;
-};
-void split(const std::string& str, char sep, std::vector<std::string>* pieces);
-Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
-  std::vector<std::string> columns;
-  split(line, '\t', &columns);
-  CHECK_EQ(columns.size(), 2UL)
-      << "data format error, should be <data>\t<shape>";
-  Record record;
-  std::vector<std::string> data_strs;
-  split(columns[0], ' ', &data_strs);
-  for (auto& d : data_strs) {
-    record.data.push_back(std::stof(d));
-  }
-  std::vector<std::string> shape_strs;
-  split(columns[1], ' ', &shape_strs);
-  for (auto& s : shape_strs) {
-    record.shape.push_back(std::stoi(s));
-  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
-  return record;
-}
-void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
-  std::string line;
-  std::ifstream file(referfile);
-  std::getline(file, line);
-  auto refer = ProcessALine(line);
-  file.close();
-  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
-  PADDLE_ENFORCE_EQ(numel, refer.data.size());
-  switch (output.dtype) {
-    case PaddleDType::INT64: {
-      for (size_t i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
-                          refer.data[i]);
-      }
-      break;
-    }
-    case PaddleDType::FLOAT32:
-      for (size_t i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_LT(
-            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
-            1e-5);
-      }
-      break;
-  }
-}
 /*
- * Use the native fluid engine to inference the demo.
+ * Use the native and analysis fluid engine to inference the demo.
 */
 void Main(bool use_gpu) {
-  NativeConfig config;
+  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
+  AnalysisConfig config;
  config.param_file = FLAGS_modeldir + "/__params__";
  config.prog_file = FLAGS_modeldir + "/__model__";
  config.use_gpu = use_gpu;
@@ -111,8 +50,8 @@ void Main(bool use_gpu) {
  }
  VLOG(3) << "init predictor";
-  auto predictor =
+  predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  VLOG(3) << "begin to process data";
  // Just a single batch of data.
@@ -130,8 +69,8 @@ void Main(bool use_gpu) {
  input.dtype = PaddleDType::FLOAT32;
  VLOG(3) << "run executor";
-  std::vector<PaddleTensor> output;
+  std::vector<PaddleTensor> output, analysis_output;
-  predictor->Run({input}, &output);
+  predictor->Run({input}, &output, 1);
  VLOG(3) << "output.size " << output.size();
  auto& tensor = output.front();
@@ -139,6 +78,10 @@ void Main(bool use_gpu) {
  // compare with reference result
  CheckOutput(FLAGS_refer, tensor);
+  // the analysis_output has some diff with native_output,
+  // TODO(luotao): add CheckOutput for analysis_output later.
+  analysis_predictor->Run({input}, &analysis_output, 1);
 }
 }  // namespace demo
@@ -146,9 +89,10 @@ void Main(bool use_gpu) {
 int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::demo::Main(false /* use_gpu*/);
  if (FLAGS_use_gpu) {
    paddle::demo::Main(true /*use_gpu*/);
+  } else {
+    paddle::demo::Main(false /*use_gpu*/);
  }
  return 0;
 }
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
--- a/paddle/fluid/operators/argsort_op.cc
+++ b/paddle/fluid/operators/argsort_op.cc
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
--- a/paddle/fluid/operators/detection/gpc.h
+++ b/paddle/fluid/operators/detection/gpc.h
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/fluid/operators/detection/poly_util.cc
--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/fluid/operators/detection/poly_util.h
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
--- a/paddle/fluid/operators/hash_op.h
+++ b/paddle/fluid/operators/hash_op.h
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
--- a/paddle/fluid/operators/isfinite_op.h
+++ b/paddle/fluid/operators/isfinite_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
--- a/paddle/fluid/operators/sequence_reverse_op.cc
+++ b/paddle/fluid/operators/sequence_reverse_op.cc
--- a/paddle/fluid/operators/sequence_reverse_op.cu
+++ b/paddle/fluid/operators/sequence_reverse_op.cu
--- a/paddle/fluid/operators/sequence_reverse_op.h
+++ b/paddle/fluid/operators/sequence_reverse_op.h
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cc
+++ b/paddle/fluid/operators/sequence_unpad_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cu
+++ b/paddle/fluid/operators/sequence_unpad_op.cu
--- a/paddle/fluid/operators/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_unpad_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
--- a/python/paddle/fluid/tests/unittests/test_hash_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
--- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_metrics.py
+++ b/python/paddle/fluid/tests/unittests/test_metrics.py
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
--- a/python/paddle/utils/plot.py
+++ b/python/paddle/utils/plot.py