diff --git a/AUTHORS.md b/AUTHORS.md
index 41b7193677a0208ba2fa82b72862292572dcb6ef..4060f75613ac4dadf353ff53a73fd0647a8052be 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -43,6 +43,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
+| tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed704585d8a6bf3befd9a549aa5a62a33fea3da9..bd5360407503de7f1ede1276904d59ac214940ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
+option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -65,6 +66,8 @@ option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
+option(ANAKIN_BUILD_FAT_BIN "Build Anakin CUDA fat-bin lib for all device platforms; ignored when WITH_ANAKIN=OFF" OFF)
+option(ANAKIN_BUILD_CROSS_PLANTFORM "Build Anakin lib for any NVIDIA device platform; ignored when WITH_ANAKIN=OFF" ON)  # name kept as-is: forwarded to Anakin as BUILD_CROSS_PLANTFORM
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(ON_INFER         "Turn on inference optimization."               OFF)
@@ -103,6 +106,8 @@ if(ANDROID OR IOS)
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
     set(WITH_MKL OFF CACHE STRING
         "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_NGRAPH OFF CACHE STRING
+        "Disable nGraph when cross-compiling for Android and iOS" FORCE)
     set(WITH_GOLANG OFF CACHE STRING
         "Disable golang when cross-compiling for Android and iOS" FORCE)
 
@@ -171,6 +176,7 @@ include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
+include(external/ngraph)    # download, build, install nGraph
 include(external/swig)      # download, build, install swig
 include(external/boost)     # download boost
 include(external/any)       # download libn::any
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index 84354c446e2f54fa13b90fa37221eed90968b251..06fc6061bc98eec8c4c71860333f7d3456952aeb 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -58,19 +58,21 @@ ExternalProject_Add(
                         -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                         -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
                         -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
+                        -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
+                        -DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
                         ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
 )
 
 message(STATUS "Anakin for inference is enabled")
 message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-
+add_dependencies(extern_anakin protobuf mklml)
 add_library(anakin_shared SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
-add_dependencies(anakin_shared extern_anakin protobuf mklml)
+add_dependencies(anakin_shared extern_anakin)
 
 add_library(anakin_saber SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
-add_dependencies(anakin_saber extern_anakin protobuf mklml)
+add_dependencies(anakin_saber extern_anakin)
 
 list(APPEND external_project_dependencies anakin_shared anakin_saber)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index baf253df2755657b01b67c410f63b7d8422d4df3..785148d4f9f44032e2ce5bf93f0dc80fc865808b 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
@@ -45,7 +44,7 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
 SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
@@ -54,7 +53,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
+    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2e335579f32df4f146c8d88e05e684a9a8105e20
--- /dev/null
+++ b/cmake/external/ngraph.cmake
@@ -0,0 +1,92 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(ngraph INTERFACE)  # declared before the guards below, so the target exists even when this file returns early
+
+IF(WIN32 OR APPLE)  # nGraph is forced off on Windows and macOS
+    MESSAGE(WARNING  # message() joins its arguments with no separator, hence the trailing space
+        "Windows or Mac is not supported with nGraph in Paddle yet. "
+        "Force WITH_NGRAPH=OFF")
+    SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE)
+ENDIF()
+
+IF(WITH_NGRAPH AND NOT WITH_MKLDNN)  # bare names: if() dereferences them and still parses when WITH_MKLDNN is unset
+    MESSAGE(WARNING  # message() joins its arguments with no separator, hence the trailing space
+        "nGraph needs mkl-dnn to be enabled. "
+        "Force WITH_NGRAPH=OFF")
+    SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE)
+ENDIF()
+
+IF(NOT ${WITH_NGRAPH})  # nothing to download or build when nGraph is disabled
+    return()  # leaves this included file only; the empty `ngraph` INTERFACE target stays defined
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(NGRAPH_PROJECT         "extern_ngraph")  # name of the ExternalProject target
+SET(NGRAPH_VERSION         "0.9")  # suffix of the versioned libngraph.so name below; NOTE(review): presumably must match what NGRAPH_GIT_TAG builds - confirm when bumping
+SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")  # commit checked out by ExternalProject
+SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)  # ExternalProject PREFIX
+SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)  # passed to nGraph as CMAKE_INSTALL_PREFIX
+SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)  # exported through the `ngraph` INTERFACE target
+SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
+SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
+SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")  # upstream repository cloned by ExternalProject
+
+ExternalProject_Add(  # clone, configure, build and install nGraph at the pinned commit
+    ${NGRAPH_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ${MKLDNN_PROJECT} ${MKLML_PROJECT}  # both must be built before nGraph
+    GIT_REPOSITORY      ${NGRAPH_GIT_REPO}
+    GIT_TAG             ${NGRAPH_GIT_TAG}
+    PREFIX              ${NGRAPH_SOURCES_DIR}
+    UPDATE_COMMAND      ""  # empty command: skip the update step
+    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
+                        -DNGRAPH_UNIT_TEST_ENABLE=FALSE
+                        -DNGRAPH_TOOLS_ENABLE=FALSE
+                        -DNGRAPH_INTERPRETER_ENABLE=FALSE
+                        -DNGRAPH_DEX_ONLY=TRUE
+                        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                        -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
+                        -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
+)
+
+if(UNIX AND NOT APPLE)
+    include(GNUInstallDirs)  # defines CMAKE_INSTALL_LIBDIR (e.g. lib or lib64)
+    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})  # NOTE(review): assumes nGraph's own install uses the same libdir - confirm
+else()  # NOTE(review): WIN32/APPLE configurations return earlier in this file, so this looks reachable only on other non-UNIX platforms - confirm
+    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
+endif()
+MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
+
+SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})  # linked through the `ngraph` INTERFACE target below
+SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})  # not referenced again in this file; presumably used by other CMake files - verify
+SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})  # not referenced again in this file; presumably used by other CMake files - verify
+
+# Workaround: nGraph expects the mklml libraries inside the mkl-dnn install directory, so symlink them into its lib/.
+ExternalProject_Add_Step(
+    ${NGRAPH_PROJECT}
+    PrepareMKL
+    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
+    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
+    DEPENDEES download  # runs after the nGraph download step ...
+    DEPENDERS configure  # ... and before its configure step
+)
+
+target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})  # consumers get the installed nGraph headers
+target_compile_definitions(ngraph INTERFACE PADDLE_WITH_NGRAPH)  # same definition as the -D spelling; CMake strips a leading -D
+target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})  # consumers link the versioned libngraph.so
+add_dependencies(ngraph ${NGRAPH_PROJECT})  # build the external project before anything that depends on `ngraph`
+list(APPEND external_project_dependencies ngraph)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 550b0dada8e90c1e2b33705fd53c065672113b45..45ef9b4550291cadaa9571f05dbaefdf4a0c223a 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY)
 UNSET_VAR(PROTOBUF_LIBRARY)
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
+function(protobuf_generate_python SRCS)  # SRCS: name of the caller's variable that receives the generated *_pb2.py paths; remaining args (ARGN) are the .proto files
+    # Copied from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+        return()
+    endif()
 
-if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
-    function(protobuf_generate_python SRCS)
-        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-        if(NOT ARGN)
-            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-            return()
-        endif()
-
-        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-            # Create an include path for each file specified
-            foreach(FIL ${ARGN})
-                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-                if(${_contains_already} EQUAL -1)
-                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
-                endif()
-            endforeach()
-        else()
-            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-        endif()
-
-        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-        endif()
-
-        if(DEFINED Protobuf_IMPORT_DIRS)
-            foreach(DIR ${Protobuf_IMPORT_DIRS})
-                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-                if(${_contains_already} EQUAL -1)
-                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
-                endif()
-            endforeach()
-        endif()
-
-        set(${SRCS})
+    if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+        # Create an include path for each file specified
         foreach(FIL ${ARGN})
             get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-            get_filename_component(FIL_WE ${FIL} NAME_WE)
-            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-                if(FIL_DIR)
-                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-                endif()
+            get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+            if(${_contains_already} EQUAL -1)
+                list(APPEND _protobuf_include_path -I ${ABS_PATH})
             endif()
+        endforeach()
+    else()
+        set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+    endif()
+    if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+        set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+    endif()
 
-            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-            add_custom_command(
-                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
-                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
-                    VERBATIM )
+    if(DEFINED Protobuf_IMPORT_DIRS)
+        foreach(DIR ${Protobuf_IMPORT_DIRS})
+            get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+            if(${_contains_already} EQUAL -1)
+                list(APPEND _protobuf_include_path -I ${ABS_PATH})
+            endif()
         endforeach()
+    endif()
 
-        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-    endfunction()
-endif()
+    set(${SRCS})  # clear the output variable before appending to it
+    foreach(FIL ${ARGN})
+        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+        get_filename_component(FIL_WE ${FIL} NAME_WE)
+        if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+            if(FIL_DIR)
+                set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+            endif()
+        endif()
+        list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+        add_custom_command(  # one protoc --python_out rule per input .proto file
+                OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
+                COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                VERBATIM )
+    endforeach()
+
+    set(${SRCS} ${${SRCS}} PARENT_SCOPE)  # export the collected paths to the caller's scope
+endfunction()
 
 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
+
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7..3cc1e028e75f009c4bbf89d9f48d5b3992697002 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -164,7 +164,7 @@ endif()
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h
+       ${src_dir}/${module}/api/paddle_*.h
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
@@ -202,10 +202,10 @@ copy(third_party DEPS fluid_lib_dist
   DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
 )
 
-# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
 copy(inference_api_lib DEPS fluid_lib_dist
   SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
   DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
 )
 
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index fa0e834a1dfd6e60f0ec07945be9a4d84017316f..3dc7171551bfb7aff8d1e75083c98b00378d247f 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -34,4 +34,5 @@ if(TENSORRT_FOUND)
         "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
     include_directories(${TENSORRT_INCLUDE_DIR})
     list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
+    add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index b6b7af951093e4d721e5d0c99e7bb818c67af749..3378d210cdf6a625e11b1dd5fe348aa04cdb9361 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
 paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -174,13 +175,16 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -189,6 +193,7 @@ paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, k
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
+paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -198,6 +203,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
 paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
 paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
@@ -268,6 +274,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 844291140602a7a0aac9d9d40256deaf9d8a4c60..50e0677c21054345a89ec7b03af38332fa64d4d1 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -136,6 +136,10 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+  shape_inference data_transform lod_tensor profiler)
+
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -163,10 +167,10 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
- 
+
 if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index fee6ba40047053ed5662fe044eceb0c687bd4db9..57ff061fe5e612495add86df8f82fe7d9f9107dc 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -18,8 +18,8 @@ namespace framework {
 
 void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
                      Tensor *out) {
-  VLOG(3) << "DeviceTransform in, src_place " << in.place()
-          << " dst_place: " << dst_place;
+  VLOG(30) << "DeviceTransform in, src_place " << in.place()
+           << " dst_place: " << dst_place;
 
   PADDLE_ENFORCE_NE(
       in.place().which(), dst_place.which(),
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index f2c55e533a2747325b1b16fdada37945a8ed3c42..21e0cb3f91cc0ae05513c3bbd470650ca71194d7 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
   OpKernelType GetExpectedKernelType(
       const ExecutionContext& ctx) const override {
     if (Attr<bool>("use_gpu")) {
-      VLOG(3) << "force use gpu kernel";
+      VLOG(30) << "force use gpu kernel";
       return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
     } else {
-      VLOG(3) << "use default kernel";
+      VLOG(30) << "use default kernel";
       return OpKernelType(proto::VarType::FP32,
                           ctx.Input<Tensor>("input")->place());
     }
@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
   // get output
   auto* output2 = scope.Var("OUT2");
   gpu_op->Run(scope, cuda_place);
-  VLOG(3) << "after gpu_op run";
+  VLOG(30) << "after gpu_op run";
 
   // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
   paddle::platform::DeviceContextPool& pool =
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 7f0d06c892541a2697a4ed083f6f4c0fc774a2a4..8e5e5427659387d63eac21a200c1a20da493e539 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!";
     return;
   }
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index 1a2a9ac328c4a9b89bfb89106af81b9fb3ed3028..4305eb65733a7c871450949ce2c48cab013bac81 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -37,8 +37,9 @@ struct TestBroadcastOpHandle {
   std::vector<Scope*> local_scopes_;
   std::vector<Scope*> param_scopes_;
   Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase* op_handle_;
+  std::vector<VarHandleBase*> vars_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   bool use_gpu_;
 #ifdef PADDLE_WITH_CUDA
@@ -90,6 +91,7 @@ struct TestBroadcastOpHandle {
   }
 
   void InitBroadcastOp(size_t input_scope_idx) {
+    nodes_.clear();
     for (size_t j = 0; j < place_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
       Scope& local_scope = local_scopes_.back()->NewScope();
@@ -101,39 +103,39 @@ struct TestBroadcastOpHandle {
     }
     param_scopes_[input_scope_idx]->Var("input");
 
-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
-                                             place_list_, nccl_ctxs_.get()));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
-                                             place_list_, nccl_ctxs_.get()));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_, nccl_ctxs_.get());
 #else
-      op_handle_.reset(
-          new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_);
 #endif
     }
 
-    std::unique_ptr<ir::Node> v =
-        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
-    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
-                                        place_list_[input_scope_idx]);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable));
+    auto* in_var_handle = new VarHandle(nodes_.back().get(), 1, input_scope_idx,
+                                        "input", place_list_[input_scope_idx]);
     vars_.emplace_back(in_var_handle);
     op_handle_->AddInput(in_var_handle);
 
     // add dummy var
 
-    std::unique_ptr<ir::Node> v2 =
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v2.get()));
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
     DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
     dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddInput(dummy_var_handle);
 
@@ -141,20 +143,20 @@ struct TestBroadcastOpHandle {
       if (!use_gpu_) {
         op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
       }
-      std::unique_ptr<ir::Node> v3 =
-          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
+      nodes_.emplace_back(
+          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable));
       VarHandle* out_var_handle =
-          new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
+          new VarHandle(nodes_.back().get(), 2, j, "out", place_list_[j]);
       vars_.emplace_back(out_var_handle);
       op_handle_->AddOutput(out_var_handle);
     }
 
     // add dummy var
-    std::unique_ptr<ir::Node> v4 =
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v4.get()));
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
     DummyVarHandle* out_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
     out_dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddOutput(out_dummy_var_handle);
   }
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 2b2329b9698908fdbe3385f1d555d756c47fc5c0..949510e03705a4a0900f1c7b8758a8f7308aa44b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -29,16 +30,14 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       local_scopes_(local_scopes),
       places_(places),
       graph_(std::move(graph)),
-      pool_(strategy.num_threads_ +
-            1),  // add one more thread for generate op_deps
+      pool_(strategy.num_threads_),
+      prepare_pool_(1),  // dedicated thread for generating op_deps
       fetch_ctxs_(places) {
-  auto &ops = graph_->Get<details::GraphOps>("ops");
-
-  for (auto &op : ops) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
     int dep = static_cast<int>(op->NotReadyInputSize());
-    op_deps_.emplace(op.get(), dep);
+    op_deps_.emplace(op, dep);
     if (dep == 0) {
-      bootstrap_ops_.emplace_back(op.get());
+      bootstrap_ops_.emplace_back(op);
     }
   }
 
@@ -54,13 +53,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   paddle::framework::FeedFetchList fetches;
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::vector<FetchOpHandle *> fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
       }
     }
   }
@@ -110,7 +109,10 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
           complete_q->Pop();
         }
       }
-      exception_.ReThrow();
+      if (exception_.IsCaught()) {
+        ClearFetchOp(graph_.get(), &fetch_ops);
+        exception_.ReThrow();
+      }
     }
     num_complete += num_comp;
   }
@@ -158,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
   });
 }
 void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
-  atomic_op_deps_ = pool_.enqueue([&] {
+  atomic_op_deps_ = prepare_pool_.enqueue([&] {
     auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
     for (auto &pair : op_deps_) {
       (*op_deps)[pair.first] = pair.second;
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 8b8382447105c8caa36963214684d6ee9fa15200..949616f02d5168e6abab932d608e4b20ee64304a 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<OpHandleBase *> bootstrap_ops_;
 
   ::ThreadPool pool_;
+  ::ThreadPool prepare_pool_;
   platform::DeviceContextPool fetch_ctxs_;
   std::atomic<int> remaining_;
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index fe18b2060c5cd7e157374da53c5a985f70545ab7..648adae06facb504042d8286f6eab5d98e99c015 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -28,11 +28,7 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
       offset_(offset),
       local_scopes_(local_scopes) {}
 
-FetchOpHandle::~FetchOpHandle() {
-  for (auto *input_var : inputs_) {
-    input_var->RemoveOutput(this, this->Node());
-  }
-}
+FetchOpHandle::~FetchOpHandle() {}
 
 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index 0f12bd2b4e857648342aeb5ad33b6c0fe01c9c73..541993c74332cc483a8b854a6b8f227c7c9a19a9 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -22,8 +22,10 @@ namespace details {
 
 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
   std::vector<std::string> out_varnames_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;
 
   void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
+    nodes_.clear();
     // initialize scope and var
     for (size_t i = 0; i < place_list_.size(); ++i) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
@@ -39,41 +41,41 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     }
 
     // create op handle node
-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new FusedBroadcastOpHandle(
-          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+      op_handle_ = new FusedBroadcastOpHandle(
+          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not supported.");
 #endif
     } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new FusedBroadcastOpHandle(
-          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+      op_handle_ = new FusedBroadcastOpHandle(
+          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
-      op_handle_.reset(
-          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
+      op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
+                                              local_scopes_, place_list_);
 #endif
     }
 
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
       // add input var handle
-      std::unique_ptr<ir::Node> in_node =
-          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
+      nodes_.emplace_back(
+          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
       VarHandle* in_var_handle =
-          new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
-                        place_list_[input_scope_idxes[i]]);
+          new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
+                        "in_var" + i, place_list_[input_scope_idxes[i]]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
 
       // add output var handle
       for (size_t j = 0; j < place_list_.size(); ++j) {
-        std::unique_ptr<ir::Node> out_node =
-            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
-        VarHandle* out_var_handle =
-            new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
+        nodes_.emplace_back(
+            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
+        VarHandle* out_var_handle = new VarHandle(
+            nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
         vars_.emplace_back(out_var_handle);
         op_handle_->AddOutput(out_var_handle);
       }
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index ed67e88ff6a7fe9efd93e5dfd4d7bdf4c43aac2e..e8cb7feb8bea92a7486b8a9d84ba4b9e2b93dbfb 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -31,9 +31,10 @@ struct TestGatherOpHandle {
   std::vector<Scope*> local_scopes_;
   std::vector<Scope*> param_scopes_;
   Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase* op_handle_;
+  std::vector<VarHandleBase*> vars_;
   std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;
 
   void WaitAll() {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
@@ -70,7 +71,7 @@ struct TestGatherOpHandle {
   }
 
   void InitGatherOp(size_t input_scope_idx) {
-    std::vector<std::unique_ptr<ir::Node>> nodes;
+    nodes_.clear();
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
       Scope& local_scope = local_scopes_.back()->NewScope();
@@ -82,44 +83,45 @@ struct TestGatherOpHandle {
     }
     param_scopes_[input_scope_idx]->Var("out");
 
-    nodes.emplace_back(
+    nodes_.emplace_back(
         ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release());
-    op_handle_.reset(
-        new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
+    op_handle_ =
+        new GatherOpHandle(nodes_.back().get(), local_scopes_, gpu_list_);
     // add input
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      nodes.emplace_back(
+      nodes_.emplace_back(
           ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release());
       auto* in_var_handle =
-          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
+          new VarHandle(nodes_.back().get(), 1, j, "input", gpu_list_[j]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
     }
 
     // add dummy var
-    nodes.emplace_back(
+    nodes_.emplace_back(
         ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
     DummyVarHandle* in_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
     in_dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddInput(in_dummy_var_handle);
 
     // add output
-    nodes.emplace_back(
+    nodes_.emplace_back(
         ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release());
-    auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
-                                         "out", gpu_list_[input_scope_idx]);
+    auto* out_var_handle =
+        new VarHandle(nodes_.back().get(), 2, input_scope_idx, "out",
+                      gpu_list_[input_scope_idx]);
     vars_.emplace_back(out_var_handle);
     op_handle_->AddOutput(out_var_handle);
 
     // add dummy var
-    nodes.emplace_back(
+    nodes_.emplace_back(
         ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
     DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
     op_handle_->AddOutput(dummy_var_handle);
   }
 
diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
index 169ce3ae7ca497e40d99b1c16633e35e1e4f1009..bf3f3637b551a8a8084e6e4f1ca6a94b65361f17 100644
--- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
+++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -35,17 +36,17 @@ static bool IsLockAndRecordEventFreeComputationOpHandle(
 
 std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
     std::unique_ptr<ir::Graph> ir_graph) const {
-  auto &all_ops = ir_graph->Get<GraphOps>(kGraphOps);
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph);
   OpGraphView graph_view(all_ops);
   for (auto &op : all_ops) {
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
     if (compute_op == nullptr) continue;
     bool is_lock_and_record_event_free =
         IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
     compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
     if (is_lock_and_record_event_free) {
-      VLOG(10) << "Set is_lock_and_record_event_free be true in op "
-               << compute_op->DebugString();
+      VLOG(100) << "Set is_lock_and_record_event_free be true in op "
+                << compute_op->DebugString();
     }
   }
   return ir_graph;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
index c9c255864a2477ed29873f8521acce37fa928c06..c8ea18804630fea4ada98062256730dbf4c24860 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -36,20 +37,20 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        insert_pending_var(version_pair.get());
+        insert_pending_var(version_pair);
       }
     }
   }
 
   for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
-    insert_pending_var(var.get());
+    insert_pending_var(var);
   }
 
-  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+  for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
     if (op->Inputs().empty()) {
-      ready_ops.insert(op.get());
+      ready_ops.insert(op);
     } else {
-      pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
+      pending_ops.insert({op, op->NoDupInputSize()});
     }
   }
 
@@ -89,6 +90,4 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
 REGISTER_PASS(multi_devices_check_pass,
               paddle::framework::details::SSAGraghBuilderWithChecker)
     .RequireGraphAttr(paddle::framework::details::kGraphVars)
-    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
-    .RequireGraphAttr(paddle::framework::details::kGraphOps)
-    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index f3819887a196a7c8bf35897467bb9d68b428094e..8c98b781301e884d5d5c7d141f3d901d74d51285 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -34,7 +34,14 @@
 namespace paddle {
 namespace framework {
 namespace details {
+
 namespace {
+// TODO(panyx0718): Clean this up as well.
+// All operators. NOTE that even though we use a vector here, the operators
+// are unordered.
+typedef std::vector<OpHandleBase *> GraphOps;
+const char kGraphOps[] = "ops";
+
 void PolishGraphToSupportDataHazards(ir::Graph *graph) {
   for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
@@ -92,7 +99,7 @@ VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
     }
     var_holder.emplace_back(var);
   } else {
-    var = var_holder.rbegin()->get();
+    var = *var_holder.rbegin();
   }
   return var;
 }
@@ -154,7 +161,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
                                                 ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
 
@@ -303,7 +310,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   result.Set(kGraphVars, new GraphVars(places_.size()));
   result.Set(kGraphDepVars, new GraphDepVars);
   result.Set(kGraphOps, new GraphOps);
-  result.Set(kShardedVarDevice, new ShardedVarDevice);
 
   // find send/recv vars so that we can place the distributed training
   // related op in the place 0
@@ -317,11 +323,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   bool is_forwarding = true;
   bool is_dist_train = false;
 
+  std::unordered_map<std::string, int> sharded_var_device;
+
   for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      int op_dev_id = CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device);
       PADDLE_ENFORCE(op_dev_id != -1,
                      "Can not schedule the RPC operator to the right place.");
       if (node->Op()->Type() == "recv") {
@@ -337,7 +345,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
     } else if (boost::get<int>(node->Op()->GetAttr(
                    OpProtoAndCheckerMaker::OpRoleAttrName())) ==
                static_cast<int>(OpRole::kDist)) {
-      int op_dev_id = CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device);
       if (node->Op()->Type() == "concat") {
         auto origin_param_name = node->Op()->OutputArgumentNames()[0];
         bcast_var_name_set[op_dev_id].emplace(origin_param_name);
@@ -356,12 +364,11 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       // the block.
       is_forwarding = false;
     } else {
-      int op_dev_id = GetOpDeviceID(result, node);
+      int op_dev_id = GetOpDeviceID(result, node, sharded_var_device);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
         CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
-          graph->Get<ShardedVarDevice>(kShardedVarDevice)
-              .emplace(n->Name(), op_dev_id);
+          sharded_var_device.emplace(n->Name(), op_dev_id);
         }
       } else {
         // This op runs on all devices, and its output may have parameter's
@@ -392,14 +399,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
               for (size_t i = 0; i < backward_vars.size(); i += 2) {
                 auto &p_name = backward_vars[i];
                 auto &g_name = backward_vars[i + 1];
-                VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+                VLOG(100) << "Bcast " << g_name << " for parameter " << p_name;
 
                 switch (strategy_.reduce_) {
                   case BuildStrategy::ReduceStrategy::kReduce:
                     cur_device_id = GetAppropriateDeviceID({g_name});
                     CreateReduceOp(&result, g_name, cur_device_id);
-                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
-                        .emplace(g_name, cur_device_id);
+                    sharded_var_device.emplace(g_name, cur_device_id);
                     if (!is_dist_train) {
                       bcast_var_name_set[cur_device_id].emplace(p_name);
                     }
@@ -458,7 +464,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-  PADDLE_ENFORCE(!ir::HasCircle(result));
+  result.Erase<GraphOps>(kGraphOps);
   return graph;
 }
 
@@ -498,7 +504,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
   result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
   auto *in =
-      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back().get();
+      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back();
   op_handle->AddInput(in);
 
   for (size_t i = 0; i < places_.size(); ++i) {
@@ -535,7 +541,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
   for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
     for (auto &p_name : bcast_varnames[dev_id]) {
       auto *in =
-          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back().get();
+          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back();
       op_handle->AddInput(in);
       for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
         auto &p = places_[out_dev_id];
@@ -571,7 +577,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
@@ -579,7 +585,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad.get());
+    op_handle->AddInput(prev_grad);
 
     auto var =
         new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
@@ -600,14 +606,14 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
     for (const std::string &d_name : datas) {
       auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
       PADDLE_ENFORCE(!vars.empty());
-      op_handle->AddInput(vars.back().get());
+      op_handle->AddInput(vars.back());
       auto var = new VarHandle(
           result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
           vars.size(), i, d_name, p);
@@ -617,8 +623,9 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
   }
 }
 
-int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
-                                           ir::Node *node) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(
+    const ir::Graph &graph, ir::Node *node,
+    const std::unordered_map<std::string, int> &sharded_var_device) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
@@ -631,16 +638,22 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
       node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
   PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(graph, param_grad[1]);
+  int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device);
   PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
                     node->Op()->Type(), param_grad[0], param_grad[1]);
   return dev_id;
 }
 
-int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
-                                            const std::string &varname) const {
-  auto &sharded_var_device = graph.Get<ShardedVarDevice>(kShardedVarDevice);
+int MultiDevSSAGraphBuilder::GetVarDeviceID(
+    const ir::Graph &graph, const std::string &varname,
+    const std::unordered_map<std::string, int> &sharded_var_device) const {
   auto got = sharded_var_device.find(varname);
+  if (got == sharded_var_device.end()) {
+    auto pos = varname.find(framework::kNewGradSuffix);
+    if (pos != std::string::npos) {
+      got = sharded_var_device.find(varname.substr(0, pos));
+    }
+  }
   return got == sharded_var_device.end() ? -1 : got->second;
 }
 
@@ -690,7 +703,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
@@ -698,7 +711,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad.get());
+    op_handle->AddInput(prev_grad);
   }
   auto &vars = result->Get<GraphVars>(kGraphVars)[dst_dev_id][og];
   auto var =
@@ -709,8 +722,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
   return var;
 }
 
-int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                               ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(
+    ir::Graph *result, ir::Node *node,
+    std::unordered_map<std::string, int> *sharded_var_device) const {
   int op_dev_id = -1;
   std::vector<std::string> input_var_names;
   std::vector<std::string> output_var_names;
@@ -725,23 +739,22 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
       node->Op()->Type() == "split_selected_rows" ||
       node->Op()->Type() == "split_ids") {
     // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
+    op_dev_id =
+        GetVarDeviceID(*result, input_var_names[0], *sharded_var_device);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>(kShardedVarDevice)
-            .emplace(varname, op_dev_id);
+        sharded_var_device->emplace(varname, op_dev_id);
       }
     }
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
     }
   } else if (node->Op()->Type() == "concat") {
-    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
+    op_dev_id =
+        GetVarDeviceID(*result, input_var_names[0], *sharded_var_device);
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
     }
   } else {
     LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
@@ -759,14 +772,14 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
 }
 
 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
   for (ir::Node *input : node->inputs) {
     VarHandle *var = nullptr;
     for (int place_offset = 0; place_offset < num_places; ++place_offset) {
       auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
       auto &var_holder = var_holders[input->Name()];
       if (!var_holder.empty()) {
-        var = var_holder.rbegin()->get();
+        var = *var_holder.rbegin();
         op_handle->AddInput(var);
       }
     }
@@ -774,12 +787,14 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
-                                         ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateRPCOp(
+    ir::Graph *result, ir::Node *node,
+    std::unordered_map<std::string, int> *sharded_var_device) const {
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name());
+    op_dev_id =
+        GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device);
     PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
                    "This hack no longer holds, please fix.");
     // the variable name which contains .block means it was splited by
@@ -794,14 +809,12 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
           node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
       PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
       op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
-      VLOG(10) << "send grad " << input_var_names[0] << " origin "
-               << send_param_grad[1] << " place: " << op_dev_id;
+      VLOG(100) << "send grad " << input_var_names[0] << " origin "
+                << send_param_grad[1] << " place: " << op_dev_id;
       for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>(kShardedVarDevice)
-            .emplace(varname, op_dev_id);
+        sharded_var_device->emplace(varname, op_dev_id);
       }
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(send_param_grad[1], op_dev_id);
+      sharded_var_device->emplace(send_param_grad[1], op_dev_id);
     }
   } else if (node->Op()->Type() == "recv") {
     std::vector<std::string> output_var_names;
@@ -811,16 +824,16 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     auto recv_param_grad = boost::get<std::vector<std::string>>(
         node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
     if (recv_param_grad.size() == 2U) {
-      op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
-      VLOG(10) << "recv param " << recv_param_grad[0]
-               << " get grad place: " << recv_param_grad[1]
-               << " place: " << op_dev_id;
+      op_dev_id =
+          GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
+      VLOG(100) << "recv param " << recv_param_grad[0]
+                << " get grad place: " << recv_param_grad[1]
+                << " place: " << op_dev_id;
     } else {
       op_dev_id = GetAppropriateDeviceID(output_var_names);
     }
     for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
     }
   } else {
     // send_barrier, fetch_barrier will run on place 0;
@@ -839,7 +852,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
     // all places
     auto p = places_[op_dev_id];
-    auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+    auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
 
@@ -847,7 +860,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     for (ir::Node *output : node->outputs) {
       int outvar_dev_id = op_dev_id;
       if (node->Op()->Type() == "fetch_barrier") {
-        outvar_dev_id = GetVarDeviceID(*result, output->Name());
+        outvar_dev_id =
+            GetVarDeviceID(*result, output->Name(), *sharded_var_device);
         PADDLE_ENFORCE_NE(outvar_dev_id, -1);
       }
       p = places_[outvar_dev_id];
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 03b2de2f04da4bac8d342a76c80fd12beaeba4b7..f3ec2d29415240b7012f458070223469d0947166 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -44,12 +44,18 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
-  int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const;
+  int GetVarDeviceID(
+      const ir::Graph &graph, const std::string &varname,
+      const std::unordered_map<std::string, int> &sharded_var_device) const;
 
   bool IsScaleLossOp(ir::Node *node) const;
 
-  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(
+      ir::Graph *result, ir::Node *node,
+      std::unordered_map<std::string, int> *sharded_var_device) const;
+  int CreateDistTrainOp(
+      ir::Graph *result, ir::Node *node,
+      std::unordered_map<std::string, int> *sharded_var_device) const;
 
   std::vector<std::string> FindDistTrainSendVars(
       const std::vector<ir::Node *> &nodes) const;
@@ -69,7 +75,9 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                              int dev_id) const;
 
-  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
+  int GetOpDeviceID(
+      const ir::Graph &graph, ir::Node *node,
+      const std::unordered_map<std::string, int> &sharded_var_device) const;
 
   void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
index 361c91dc78c08a2cbf84ee88211d389c1e2312e5..8f92f0948d7d397ab0f20c01eae9e313f739adec 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -62,7 +63,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
   });
 
   size_t op_id = 0;
-  for (auto &op : graph.Get<GraphOps>(kGraphOps)) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(graph)) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 175c5a9950be69d7bf6ae9e386af762007a18a51..1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -35,23 +35,14 @@ namespace details {
 // The outside vector is the device vector. Each element of this vector is a
 // map from variable name to variables. The variables, who have the same name,
 // will have a differsent version. The offset in the
-// `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
-typedef std::vector<
-    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+// `std::vector<VarHandle*>` is the version of variables.
+typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
     GraphVars;
 const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+typedef std::unordered_set<VarHandleBase*> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
-
-// all operators. NOTE that even we use a vector here, the operators is
-// unordered.
-typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
-const char kGraphOps[] = "ops";
-
-typedef std::unordered_map<std::string, int> ShardedVarDevice;
-const char kShardedVarDevice[] = "sharded_var_device";
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc
index 65dafd376f7c687410270e35f105ff595fe78f59..4838c4198ff35ba3fb562f3a7c0563ee60179e3b 100644
--- a/paddle/fluid/framework/details/op_graph_view.cc
+++ b/paddle/fluid/framework/details/op_graph_view.cc
@@ -20,19 +20,16 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-OpGraphView::OpGraphView(
-    const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
-  Build(ops);
-}
+OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }
 
-void OpGraphView::Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
+void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
   for (auto &op : ops) {
-    preceding_ops_[op.get()];
-    pending_ops_[op.get()];
+    preceding_ops_[op];
+    pending_ops_[op];
     for (auto &var : op->Outputs()) {
       for (auto &pending_op : var->PendingOps()) {
-        preceding_ops_[pending_op].insert(op.get());
-        pending_ops_[op.get()].insert(pending_op);
+        preceding_ops_[pending_op].insert(op);
+        pending_ops_[op].insert(pending_op);
       }
     }
   }
@@ -41,8 +38,6 @@ void OpGraphView::Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
       "There are duplicate ops in graph.");
 }
 
-size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
-
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
   std::unordered_set<OpHandleBase *> ret;
   for (auto &pair : preceding_ops_) {
@@ -60,12 +55,6 @@ void OpGraphView::EnforceHasOp(OpHandleBase *op) const {
                  op == nullptr ? "nullptr" : op->DebugString());
 }
 
-const std::unordered_set<OpHandleBase *> &OpGraphView::PrecedingOps(
-    OpHandleBase *op) const {
-  EnforceHasOp(op);
-  return preceding_ops_.at(op);
-}
-
 const std::unordered_set<OpHandleBase *> &OpGraphView::PendingOps(
     OpHandleBase *op) const {
   EnforceHasOp(op);
diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h
index 398c019be00a6ff5f5b39fdcbe97339341b1685b..afb3e8e59461eeba10d7027fc70b89cc170c1805 100644
--- a/paddle/fluid/framework/details/op_graph_view.h
+++ b/paddle/fluid/framework/details/op_graph_view.h
@@ -26,21 +26,16 @@ namespace details {
 
 class OpGraphView {
  public:
-  explicit OpGraphView(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
-
-  size_t OpNumber() const;
+  explicit OpGraphView(const std::vector<OpHandleBase *> &ops);
 
   std::unordered_set<OpHandleBase *> AllOps() const;
 
-  const std::unordered_set<OpHandleBase *> &PrecedingOps(
-      OpHandleBase *op) const;
-
   const std::unordered_set<OpHandleBase *> &PendingOps(OpHandleBase *op) const;
 
   bool HasOp(OpHandleBase *op) const;
 
  private:
-  void Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
+  void Build(const std::vector<OpHandleBase *> &ops);
   void EnforceHasOp(OpHandleBase *op) const;
 
   std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>>
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index d09b94a3fd32952985a37cf4246c7640d2db4f56..ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -31,7 +31,10 @@ constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 // It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
  public:
-  explicit OpHandleBase(ir::Node *node) : node_(node) {}
+  // Owned by `node`. No need to be deleted explicitly.
+  explicit OpHandleBase(ir::Node *node) : node_(node) {
+    node_->WrappedBy(this);
+  }
 
   virtual ~OpHandleBase();
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 3a9a58412391b188c5e804b41fa47b3607a36bd1..72299c0bfa916d3b92e1c5020ddd69dadad3701d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -30,8 +30,8 @@ struct TestReduceOpHandle {
   Scope g_scope_;
   std::vector<Scope *> local_scopes_;
   std::vector<Scope *> param_scopes_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase *op_handle_;
+  std::vector<VarHandleBase *> vars_;
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
 
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 0b994ced7f751f056fec076e3dea8d14d0bed991..28443cc886e4c3f5db707d6d8fe9971618d8c2f7 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/reference_count_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -71,14 +72,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
   // Step 2: Find all variables in non-computation ops which refers to variables
   // in computation ops
   std::unordered_set<std::string> names;
-  std::unordered_map<OpHandleBase *, std::unique_ptr<ReferenceCountOpHandle>>
+  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
       compute_ref_cnt_map;
 
   auto get_ref_cnts_from_compute_op = [&](
-      const std::unique_ptr<OpHandleBase> &op,
-      const std::vector<VarHandleBase *> &vars) {
+      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
     std::vector<std::string> var_names_in_op;
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
     if (compute_op == nullptr ||
         !platform::is_gpu_place(compute_op->GetPlace()))
       return var_names_in_op;
@@ -121,9 +121,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
   };
 
   auto update_ref_cnts_from_non_compute_op = [&](
-      const std::unique_ptr<OpHandleBase> &op,
-      const std::vector<VarHandleBase *> &vars) {
-    if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
+      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
+    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
     for (VarHandleBase *var_handle_base : vars) {
       auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
       if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
@@ -141,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
         if (next_compute_op != nullptr) {
           if (compute_ref_cnt_map.count(next_compute_op)) {
             compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
-            VLOG(5) << "Add reference count of " << var_name << " to Operator "
-                    << next_compute_op->Name();
+            VLOG(50) << "Add reference count of " << var_name << " to Operator "
+                     << next_compute_op->Name();
           } else {
             // Create new reference_count_op_handle
             ir::Node *ref_cnt_node = graph->CreateEmptyNode(
@@ -151,21 +150,21 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
                 ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
                 gcs[place.device].get(), cur_ref_cnts[place.device].get());
             AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
-            compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle);
+            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
           }
         }
       }
     }
   };
 
-  auto &all_ops = graph->Get<GraphOps>(kGraphOps);
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
   for (auto &op : all_ops) {
     auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
     auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
     if (in_var_names.empty() && out_var_names.empty()) continue;
     in_var_names.insert(in_var_names.end(), out_var_names.begin(),
                         out_var_names.end());
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
     auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
     ir::Node *ref_cnt_node =
         graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
@@ -173,7 +172,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
         ref_cnt_node, compute_op->GetScope(), place, in_var_names,
         gcs[place.device].get(), cur_ref_cnts[place.device].get());
     AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
-    compute_ref_cnt_map[compute_op].reset(ref_cnt_handle);
+    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
   }
 
   for (auto &op : all_ops) {
@@ -181,11 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     update_ref_cnts_from_non_compute_op(op, op->Outputs());
   }
 
-  std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
+  std::vector<OpHandleBase *> new_all_ops;
   new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
   for (auto &op : all_ops) {
     new_all_ops.emplace_back(std::move(op));
-    auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
+    auto it = compute_ref_cnt_map.find(new_all_ops.back());
     if (it != compute_ref_cnt_map.end()) {
       // Add LeafNode to ReferenceCountOpHandle
       auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index ef1626599795a553e654fe5d3ed74ef3a3a67d78..6ab6cb2332b0af3fa16b986f115513ee098fae4f 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                         ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
+      VLOG(100) << place_ << "RUN Scale loss grad op";
     });
 #endif
   }
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
index cc2c8bfef9f9f54c2e499467df0d22ce3f69d6b8..f78a47bb78e6f1d81db6abed11a7762f21dd2226 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
     op_node_list[i - 1]->outputs.push_back(dep_var);
     dep_var->outputs.push_back(op_node_list[i]);
     dep_var->inputs.push_back(op_node_list[i - 1]);
-    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
-             << " and " << op_node_list[i]->Name();
+    VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name()
+              << " and " << op_node_list[i]->Name();
   }
   return graph;
 }
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 780da5478ff34ecd7096d0ef62b72bf1088dd221..af2cbd5c876fdd7c27cd679f7e9412d1b0604ecc 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -19,14 +19,16 @@ namespace framework {
 namespace details {
 SSAGraphExecutor::~SSAGraphExecutor() {}
 
-void ClearFetchOp(ir::Graph* graph,
-                  std::vector<std::unique_ptr<FetchOpHandle>>* fetch_ops) {
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops) {
   if (fetch_ops->empty()) return;
 
   for (auto& op : *fetch_ops) {
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
+    for (auto& in_var : op->Inputs()) {
+      in_var->RemoveOutput(op, op->Node());
+    }
     graph->RemoveNode(op->Node());
   }
   fetch_ops->clear();
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index d5cf7737d565c523995e6685b73c57e5a6f0197b..860eaa25b58e4579ad792ff18618de3b90707e8d 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -38,8 +38,7 @@ class SSAGraphExecutor {
   virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };
 
-void ClearFetchOp(ir::Graph* graph,
-                  std::vector<std::unique_ptr<FetchOpHandle>>* fetch_ops);
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops);
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 2d2bdb604f2d08adbaa0b38d04b8e377b2e6ab6c..f781f02a076594b5a70fd4863ebf273e88607dfd 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -51,25 +52,25 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
+        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
       }
     }
   }
   for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
+    InsertPendingVar(&pending_vars, ready_vars.get(), var);
   }
 
-  for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op.get());
+      ready_ops.insert(op);
     } else {
-      InsertPendingOp(&pending_ops, op.get());
+      InsertPendingOp(&pending_ops, op);
     }
   }
 
   // Step 2. Insert FetchOps
-  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
+  std::vector<FetchOpHandle *> fetch_ops;
+  std::unordered_set<VarHandleBase *> fetch_dependencies;
   FeedFetchList fetch_data(fetch_tensors.size());
 
   InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
@@ -109,6 +110,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
+        ClearFetchOp(graph_.get(), &fetch_ops);
         exception_holder_.ReThrow();
       } else {
         continue;
@@ -140,8 +142,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
 void ThreadedSSAGraphExecutor::InsertFetchOps(
     const std::vector<std::string> &fetch_tensors,
-    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
-    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+    std::vector<FetchOpHandle *> *fetch_ops,
+    std::unordered_set<VarHandleBase *> *fetch_dependencies,
     std::unordered_map<OpHandleBase *, size_t> *pending_ops,
     std::unordered_set<VarHandleBase *> *pending_vars,
     BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
@@ -151,7 +153,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
       }
     }
   }
@@ -208,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
     details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      if (VLOG_IS_ON(10)) {
-        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      if (VLOG_IS_ON(100)) {
+        VLOG(100) << op << " " << op->Name() << " : " << op->DebugString();
       }
       if (LIKELY(!strategy_.dry_run_)) {
         op->Run(strategy_.use_cuda_);
       }
-      VLOG(10) << op << " " << op->Name() << " Done ";
+      VLOG(100) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(100) << op << " " << op->Name() << "Signal posted";
     } catch (...) {
       exception_holder_.Catch(std::current_exception());
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 5c0bc169eaf3f54596eb8e08b7bf80a82253c9b2..24da56c09e3e0f3894d58e5af8838c98e3e1e67c 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -70,13 +70,13 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                         BlockingQueue<VarHandleBase *> *ready_vars,
                         VarHandleBase *var) const;
 
-  void InsertFetchOps(
-      const std::vector<std::string> &fetch_tensors,
-      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
-      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
-      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-      std::unordered_set<VarHandleBase *> *pending_vars,
-      BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
+  void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
+                      std::vector<FetchOpHandle *> *fetch_ops,
+                      std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+                      std::unordered_set<VarHandleBase *> *pending_vars,
+                      BlockingQueue<VarHandleBase *> *ready_vars,
+                      FeedFetchList *fetch_data);
 
  private:
   ExecutionStrategy strategy_;
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 5457870e9ff5d7cf67c9c7076b9aae94eeada779..30da029ca2a90e7faa6288557ff2f1aeb21cc1c6 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -20,6 +20,8 @@ namespace details {
 
 VarHandleBase::~VarHandleBase() {}
 
+VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
+
 std::string VarHandle::DebugString() const {
   std::stringstream ss;
   ss << name_ << ":" << place_;
@@ -27,6 +29,10 @@ std::string VarHandle::DebugString() const {
 }
 
 std::string DummyVarHandle::DebugString() const { return node_->Name(); }
+
+DummyVarHandle::~DummyVarHandle() {
+  VLOG(4) << "deleting dummy var handle " << DebugString();
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index a1f458c660ce9f73bc9ac2ed194091ad0b8f8400..3b007d7b1a52df765a2dbd41939f8f865123cb43 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -35,7 +35,10 @@ class OpHandleBase;
 // A variable can only be generated by a single operator. i.e.
 // This is a single assignment graph.
 struct VarHandleBase {
-  explicit VarHandleBase(ir::Node* node) : node_(node) {}
+  // Owned by `node`. No need to be deleted explicitly.
+  explicit VarHandleBase(ir::Node* node) : node_(node) {
+    node_->WrappedBy(this);
+  }
 
   virtual ~VarHandleBase();
 
@@ -94,6 +97,8 @@ struct VarHandleBase {
 struct VarHandle : public VarHandleBase {
   explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
 
+  virtual ~VarHandle();
+
   std::string DebugString() const override;
 
   VarHandle(ir::Node* node, size_t version, size_t scope_index,
@@ -121,6 +126,8 @@ struct VarHandle : public VarHandleBase {
 struct DummyVarHandle : public VarHandleBase {
   explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
 
+  virtual ~DummyVarHandle();
+
   std::string DebugString() const override;
 };
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8ed0ba1dfa68b3e22f370c3f2dd0f83c3e5506b0..0313a6a1e3d11b9c43714544db15b092bbc586b3 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/macros.h"
@@ -25,6 +26,7 @@ limitations under the License. */
 
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 
 namespace paddle {
 namespace framework {
@@ -43,7 +45,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
 }
 
 ExecutorPrepareContext::~ExecutorPrepareContext() {
-  VLOG(5) << "destroy ExecutorPrepareContext";
+  VLOG(50) << "destroy ExecutorPrepareContext";
 }
 
 template <typename RefCntMap>
@@ -60,7 +62,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
         if ((it->second)-- == 1) {
           auto* var = scope.FindVar(name);
           if (var != nullptr) {
-            VLOG(10) << "Erase tensor \'" << name << "\'";
+            VLOG(100) << "Erase tensor \'" << name << "\'";
             if (var->IsType<LoDTensor>()) {
               erase_tensors.insert(var->GetMutable<LoDTensor>());
             } else if (var->IsType<SelectedRows>()) {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
   }
 }
 
+static void EnableFusedOp(ExecutorPrepareContext* ctx) {
+#ifdef PADDLE_WITH_NGRAPH
+  VLOG(30) << "use_ngraph=True";
+  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  for (auto& interval : intervals) {  // fused op takes the interval's 1st slot
+    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
+                                       interval.at(0), interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+  }
+  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+    ctx->ops_.erase(it->at(0) + 1, it->at(1));  // drop rest of the interval
+  }  // reverse order, presumably so earlier iterators stay valid
+#else
+  LOG(WARNING)
+      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
+#endif
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -141,21 +161,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
       if (var->Persistable()) {
         auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " global, which pointer is " << ptr;
       } else {
         auto* ptr = scope->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " locally, which pointer is " << ptr;
       }
     }
   } else {
     for (auto& var : global_block.AllVars()) {
       auto* ptr = scope->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
+      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
+               << ptr;
     }
   }
 }
@@ -286,7 +306,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     int i = 0;
     for (auto& feed_target : (*feed_targets)) {
       std::string var_name = feed_target.first;
-      VLOG(3) << "feed target's name: " << var_name;
+      VLOG(30) << "feed target's name: " << var_name;
 
       // prepend feed op
       auto* op = global_block->PrependOp();
@@ -309,7 +329,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     int i = 0;
     for (auto& fetch_target : (*fetch_targets)) {
       std::string var_name = fetch_target.first;
-      VLOG(3) << "fetch target's name: " << var_name;
+      VLOG(30) << "fetch target's name: " << var_name;
 
       // append fetch op
       auto* op = global_block->AppendOp();
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
   return ctx;
 }
 
@@ -359,6 +380,7 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                   bool create_local_scope, bool create_vars,
                                   bool keep_kids) {
+  PADDLE_ENFORCE_NOT_NULL(scope);
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
@@ -398,8 +420,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     }
 
     if (FLAGS_benchmark) {
-      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
-              << memory::memory_usage(place_);
+      VLOG(20) << "Memory used after operator " + op->Type() + " running: "
+               << memory::memory_usage(place_);
     }
   }
 
@@ -424,10 +446,10 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   if (FLAGS_benchmark) {
-    VLOG(2) << "-------------------------------------------------------";
-    VLOG(2) << "Memory used after deleting local scope: "
-            << memory::memory_usage(place_);
-    VLOG(2) << "-------------------------------------------------------";
+    VLOG(20) << "-------------------------------------------------------";
+    VLOG(20) << "Memory used after deleting local scope: "
+             << memory::memory_usage(place_);
+    VLOG(20) << "-------------------------------------------------------";
   }
 }
 
@@ -471,7 +493,7 @@ void Executor::RunPreparedContext(
 
 void Executor::EnableMKLDNN(const ProgramDesc& program) {
 #ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
+  VLOG(30) << "use_mkldnn=True";
   for (size_t bid = 0; bid < program.Size(); ++bid) {
     auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
     for (auto* op : block->AllOps()) {
@@ -485,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 3e9353f5cf67d8de62c5551f12ea786e49190549..1f3c19c0d5901cec9acc4ac9c5dab538d620c956 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
   // be created.
-  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index;
   Variable* g_feed_value = scope->Var(var_name);
   auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
   if (index >= feed_inputs.size()) {
@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                  typeid(FeedFetchList).name());
   auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
   auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
+  VLOG(30) << "Fetch " << var_name << " with index " << index
+           << " shape= " << tensor.dims();
   PADDLE_ENFORCE_LT(index, fetch_outputs.size());
   return tensor;
 }
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 28231a53bad50fe9f19cfe3e73c3dc09aa3762cf..504f7e6d6c13d6c40d72a53e52fec920457f2dae 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
 
 
 # Usage: pass_library(target inference) will append to paddle_inference_pass.h
+unset(INFER_IR_PASSES CACHE) # reset the cached list that pass_library() appends to
 function(pass_library TARGET DEST)
     set(options "")
     set(oneValueArgs "")
@@ -15,10 +16,11 @@ function(pass_library TARGET DEST)
     if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
         message(STATUS "add pass ${TARGET} ${DEST}")
         file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+        set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "")
     endif()
 endfunction()
 
+
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node pretty_log)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
@@ -53,6 +55,7 @@ set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
 cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
 
+cc_test(node_test SRCS node_test.cc DEPS node)
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index 6090f1fe76a49dddad0640123b1fa4db8c489634..8668007da1d4ef75a1b95f8fe86e52ae0159c899 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) {
 #undef OP_SET_IN
 #undef OP_SET_OUT
 
-  auto* X = graph->RetriveNode(34);
-  auto* LSTMOUT = graph->RetriveNode(81);
-  auto* cell_init = graph->RetriveNode(6);
-  auto* hidden_init = graph->RetriveNode(8);
+  auto* X = graph->RetrieveNode(34);
+  auto* LSTMOUT = graph->RetrieveNode(81);
+  auto* cell_init = graph->RetrieveNode(6);
+  auto* hidden_init = graph->RetrieveNode(8);
 
   auto* lstm_op = graph->CreateOpNode(&op_desc);
   PrepareParameters(graph, param);
@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
   scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
   scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
 
-#define GATE_W(name__)                                               \
-  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");            \
-  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");            \
-  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");            \
-  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);       \
-  VLOG(4) << #name__ "_w0"                                           \
-          << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_w1"                                           \
-          << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_b0"                                           \
-          << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
-  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();       \
-  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();       \
+#define GATE_W(name__)                                                \
+  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");             \
+  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");             \
+  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");             \
+  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);        \
+  VLOG(40) << #name__ "_w0"                                           \
+           << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_w1"                                           \
+           << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_b0"                                           \
+           << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
+  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();        \
+  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();        \
   auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
 
   GATE_W(forget);
@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
   int D = W_forget_w0.dims()[0];
   int M = W_forget_w1.dims()[0];
   out->Resize(make_ddim({D + M, 4 * D}));
-  VLOG(3) << "LSTMWeight resized to " << out->dims();
+  VLOG(30) << "LSTMWeight resized to " << out->dims();
 
   float* out_data = out->mutable_data<float>(platform::CPUPlace());
   std::array<const float*, 4> tensors(
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
index 449cc78be15bcd2575ce2e6846b41e475f8921f6..c9c4d5afe5a0cd67ea14ae7abcf2b2bad1407e39 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
   int found_conv_bias_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBias fuse";
+    VLOG(40) << "handle ConvBias fuse";
     GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                               conv_bias_pattern);                      // Filter
     GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
     // check if fuse can be done and if MKL-DNN should be used
     FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
     if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
-      VLOG(3) << "do not perform conv+bias fuse";
+      VLOG(30) << "do not perform conv+bias fuse";
       return;
     }
 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 846a14e365e6bd7f056d409130a3b246371931da..34b4c26ae3a8c281cd2729f67e49c78a8f440cc5 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
   int found_conv_bn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";
 
     // conv, batch_norm,
     // conv_weight, conv_out,
@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
     // check if fuse can be done and if MKL-DNN should be used
     FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
     if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+bn fuse";
+      VLOG(30) << "do not perform conv+bn fuse";
       return;
     }
 
@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
   int found_conv_bn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";
 
     // conv, batch_norm,
     // conv_weight, conv_out,
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
index e359a3832ee8d549f8c58d63bc1cc6564ecadede..048868e1f913e9df3d985b9e66c075a02a7f0bcb 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
   int found_conv_relu_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvReLU fuse";
+    VLOG(40) << "handle ConvReLU fuse";
     GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                               conv_relu_pattern);                      // Filter
     GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
 
     FuseOptions fuse_option = FindFuseOption(*conv, *relu);
     if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+relu fuse";
+      VLOG(30) << "do not perform conv+relu fuse";
       return;
     }
 
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
index 19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c..5f3334578d10f64b197215bfc11d08e30747cb90 100644
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
   int found_depthwise_conv_mkldnn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    VLOG(30) << "handle DepthwiseConvMKLDNN fuse";
     GET_NODE(depthwise_conv, (*pattern));
     depthwise_conv->Op()->SetType("conv2d");
     found_depthwise_conv_mkldnn_count++;
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index ca704c7f5631bbaa88f1bc2caaa22fd021de11c4..3348abb19b3339b2b3e8b50485133b15a1973a32 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle FC fuse";
+    VLOG(40) << "handle FC fuse";
     GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
index 648acc4a759417240d9a39749b059289182ebb1e..8ed68905beed2faedc34f194070cc76e8ff3c32d 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
     GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
                               elewise_add_act_pattern);
@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
     Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
         g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
 
-    VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
-            << ele_add->Name() << " -> " << ele_out_n << "\n"
-            << "\t " << ele_out_n << " -> " << act->Name() << " -> "
-            << act_out_n;
+    VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
+             << ele_add->Name() << " -> " << ele_out_n << "\n"
+             << "\t " << ele_out_n << " -> " << act->Name() << " -> "
+             << act_out_n;
 
     ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
     found_elewise_add_act_count++;
@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
     GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
     Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
         g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
 
-    VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
-            << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
-            << ele_add->Name() << " -> " << elewise_add_out_n;
+    VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
+             << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
+             << ele_add->Name() << " -> " << elewise_add_out_n;
 
     ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
     found_elewise_add_act_count++;
@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
+    VLOG(40) << "handle FuseElewiseAddActGrad1 fuse";
     GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
 
     auto fused_node = g->CreateOpNode(&desc);
 
-    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
-            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
-            << d_itermediate_out_n << " and " << act_out_n << " -> "
-            << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
+    VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
+             << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
+             << d_itermediate_out_n << " and " << act_out_n << " -> "
+             << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
 
     ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
     found_elewise_add_act_count++;
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 132159b8b272f311060a39b58919c26822bf50ee..ae0e42ff5e89466013382ab97650e6afeeff3d2d 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -84,15 +84,13 @@ void CheckProgram(const ProgramDesc &program) {
 
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   CheckProgram(program_);
-  // Make the nodes id start from 0.
-  Node::ResetId();
   auto var_nodes = InitFromProgram(program_);
   ResolveHazard(var_nodes);
 }
 
 std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
     const ProgramDesc &program) {
-  VLOG(3) << "block in program:" << program_.Size();
+  VLOG(30) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
   // var nodes for each var name, will have multiple versions in SSA
   std::map<std::string, std::vector<ir::Node *>> var_nodes;
@@ -160,7 +158,7 @@ void Graph::ResolveHazard(
     auto it_old = versions.rbegin();
     ++it_old;
     for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
-      VLOG(3) << "deal with var: " << (*it_new)->Name();
+      VLOG(30) << "deal with var: " << (*it_new)->Name();
       ir::Node *write_op =
           (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
       const auto &read_ops = (*it_old)->outputs;
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 9d7aa5d32deb274fbf29481b0d4754c05d1e21b5..0c856f8e610077c69416ccfb8a763d4b8ae881b8 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -89,7 +89,7 @@ class Graph {
                    attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
       delete attr;
     };
   }
@@ -102,18 +102,31 @@ class Graph {
     attr_dels_[attr_name] = []() {};
   }
 
+  template <typename AttrType>  // AttrType is not used by the body
+  void Erase(const std::string &attr_name) {  // fails if attr_name is not set
+    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
+                   attr_name);
+    attr_dels_[attr_name]();  // run the deleter registered for this attribute
+    attrs_.erase(attr_name);
+    attr_dels_.erase(attr_name);
+  }
+
   const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
 
   // Create a normal variable with non-null VarDesc.
   ir::Node *CreateVarNode(VarDesc *var_desc) {
     PADDLE_ENFORCE(var_desc);
-    return AddNode(new ir::Node(var_desc));
+    auto *x = AddNode(new ir::Node(var_desc));
+    x->SetId(num_node_created_++);
+    return x;
   }
 
   // Create a normal runnable operator with OpDesc.
   ir::Node *CreateOpNode(OpDesc *op_desc) {
     PADDLE_ENFORCE(op_desc);
-    return AddNode(new ir::Node(op_desc));
+    auto *x = AddNode(new ir::Node(op_desc));
+    x->SetId(num_node_created_++);
+    return x;
   }
 
   // Create a control dependency var that connects 2 operations. The
@@ -123,13 +136,17 @@ class Graph {
     // TODO(panyx0718): control var name should be really unique.
     const std::string name = string::Sprintf(
         "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
-    return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+    x->SetId(num_node_created_++);
+    return x;
   }
 
   // A more free style way of creating a graph node. Mostly use for test
   // or "copy" from another node. Avoid using it if possible.
   ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
-    return AddNode(new ir::Node(name, type));
+    auto *x = AddNode(new ir::Node(name, type));
+    x->SetId(num_node_created_++);
+    return x;
   }
 
   // Clear all node information of the graph and return the ownership of the
@@ -151,7 +168,7 @@ class Graph {
   }
 
   // NOTE low performance, but simple and secure.
-  Node *RetriveNode(int id) {
+  Node *RetrieveNode(int id) {
     for (auto &node : nodes_) {
       if (node.second->id() == id) {
         return node.second.get();
@@ -160,6 +177,7 @@ class Graph {
     return nullptr;
   }
 
+  const ProgramDesc &program() const { return program_; }
   std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
       const ProgramDesc &program);
 
@@ -181,6 +199,7 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
   std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
   std::unordered_set<ir::Node *> node_set_;
+  size_t num_node_created_{0};  // next unique node id to assign on creation.
 };
 
 bool IsControlDepVar(const ir::Node &var);
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 01e878089171e4620f32b57a65d92d1c86d307db..98112c1ed317c230cb5150e7cbc6d0d173256601 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -33,8 +33,9 @@ void SortHelper(
     }
   }
 
-  VLOG(3) << "topology sort insert: " << node->Name()
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  VLOG(30) << "topology sort insert: " << node->Name()
+           << reinterpret_cast<void *>(node) << " input "
+           << node->inputs.size();
   ret->push_back(node);
 }
 
@@ -103,9 +104,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     for (auto &var : n->inputs) {
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                << " -> " << n->Name() << reinterpret_cast<void *>(n)
-                << "  via " << var->Name() << reinterpret_cast<void *>(var);
+        VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
         adj_list[n].insert(adj_n);
       }
     }
@@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) {
     graph_nodes.emplace_back(g_nodes);
   }
 
-  if (VLOG_IS_ON(10)) {
-    VLOG(10) << "graph_num: " << graph_nodes.size();
+  if (VLOG_IS_ON(100)) {
+    VLOG(100) << "graph_num: " << graph_nodes.size();
     for (auto &g_n : graph_nodes) {
-      VLOG(10) << "graph_nodes: " << g_n.size();
+      VLOG(100) << "graph_nodes: " << g_n.size();
       if (g_n.size() < 10) {
         std::stringstream out;
         for (auto &node : g_n) {
@@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) {
           }
           out << "]";
         }
-        VLOG(10) << out.str();
+        VLOG(100) << out.str();
       }
     }
   }
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index ec46b38c01b8c369ab37b4fbd5497ec120d8db91..8d92c406689ab3a97596a8666ceb452aec4be170 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -37,6 +37,15 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph);
 
+template <typename T>
+std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
+  std::vector<T *> wrappers;  // result order follows the unordered node set
+  for (ir::Node *node : graph.Nodes()) {
+    if (node->IsWrappedBy<T>()) wrappers.push_back(&node->Wrapper<T>());
+  }
+  return wrappers;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index b20d70132256bd5df7411c46ff4eb246b1f14ba8..b534a5509279ef7bfc5fc92ec726224e6c5ed16f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
@@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
   PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
   int id = 0;
   for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
+    VLOG(30) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
 
 bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
-  VLOG(3) << "mark pdnodes in graph";
+  VLOG(30) << "mark pdnodes in graph";
   if (graph.Nodes().empty()) return false;
 
   for (auto &node : GraphTraits::DFS(graph)) {
     for (const auto &pdnode : pattern_.nodes()) {
       if (pdnode->Tell(&node)) {
-        VLOG(4) << "pdnode " << pdnode->name() << " marked";
+        VLOG(40) << "pdnode " << pdnode->name() << " marked";
         pdnodes2nodes_[pdnode.get()].insert(&node);
       }
     }
@@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
   // Check to early stop if some PDNode can't find matched Node.
   for (auto &pdnode : pattern_.nodes()) {
     if (!pdnodes2nodes_.count(pdnode.get())) {
-      VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
+      VLOG(40) << pdnode->name() << " can't find matched Node, early stop";
       // return false;
     }
   }
@@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
       GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
     }
   }
-  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+  VLOG(30) << pdnodes2nodes_.size() << " nodes marked";
 
   return !pdnodes2nodes_.empty();
 }
@@ -166,10 +167,12 @@ struct HitGroup {
 
   bool Match(Node *node, PDNode *pat) {
     if (nodes_.count(node)) {
-      if (!roles.count(pat)) return false;
-      return roles[pat] == node;
+      if (roles.count(pat) && roles[pat] == node) return true;
+      return false;
+    } else {
+      if (roles.count(pat) && roles[pat] != node) return false;
+      return true;
     }
-    return !roles.count(pat) || roles.at(pat) == node;
   }
 
   void Register(Node *node, PDNode *pat) {
@@ -197,7 +200,6 @@ GraphPatternDetector::DetectPatterns() {
   std::vector<GraphPatternDetector::subgraph_t> result;
   std::vector<HitGroup> init_groups;
   std::array<std::vector<HitGroup>, 2> bi_records;
-  // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
   auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
                                                : pattern_.edges().front().first;
   if (!pdnodes2nodes_.count(first_pnode)) return result;
@@ -213,7 +215,7 @@ GraphPatternDetector::DetectPatterns() {
   // Extend a PDNode to subgraphs by deducing the connection relations defined
   // in edges of PDNodes.
   for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
+    VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name();
     // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
     // Each role has two PDNodes, which indicates two roles.
     // Detect two Nodes that can match these two roles and they are connected.
@@ -224,14 +226,15 @@ GraphPatternDetector::DetectPatterns() {
     // source -> target
     for (Node *source : pdnodes2nodes_[edge.first]) {
       for (Node *target : pdnodes2nodes_[edge.second]) {
-        VLOG(8) << "check " << source->id() << " -- " << target->id();
+        VLOG(80) << "check " << source->id() << " -- " << target->id();
         // TODO(Superjomn) add some prune strategies.
         for (const auto &group : pre_groups) {
-          HitGroup new_group = group;
-          if (IsNodesLink(source, target) &&
-              new_group.Match(source, edge.first)) {
-            new_group.Register(source, edge.first);
-            if (new_group.Match(target, edge.second)) {
+          if (IsNodesLink(source, target)) {
+            HitGroup new_group = group;
+            bool flag = new_group.Match(source, edge.first) &&
+                        new_group.Match(target, edge.second);
+            if (flag) {
+              new_group.Register(source, edge.first);
               new_group.Register(target, edge.second);
               cur_groups.push_back(new_group);
               // TODO(Superjomn) need to unique
@@ -240,12 +243,13 @@ GraphPatternDetector::DetectPatterns() {
         }
       }
     }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
+    VLOG(30) << "step " << step << " get records: " << cur_groups.size();
     for (auto &group : cur_groups) {
       for (auto &item : group.roles) {
-        VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
+        VLOG(40) << "node " << item.second->id() << " as "
+                 << item.first->name();
       }
-      VLOG(4) << "=========================================================";
+      VLOG(40) << "=========================================================";
     }
   }
 
@@ -259,14 +263,16 @@ GraphPatternDetector::DetectPatterns() {
   return result;
 }
 
-bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
+struct GraphItemLessThan {
+  bool operator()(const std::pair<PDNode *, Node *> &a,
                   const std::pair<PDNode *, Node *> &b) {
-  if (a.first != b.first) {
-    return a.first < b.first;
-  } else {
-    return a.second < b.second;
+    if (a.first != b.first) {
+      return a.first < b.first;
+    } else {
+      return a.second < b.second;
+    }
   }
-}
+};
 
 // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
 // see https://github.com/PaddlePaddle/Paddle/issues/13550
@@ -280,7 +286,7 @@ void GraphPatternDetector::UniquePatterns(
   for (auto &g : *subgraphs) {
     // Sort the items in the sub-graph, and transform to a string key.
     std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
-    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
     std::stringstream ss;
     for (auto &item : sorted_keys) {
       ss << item.first << ":" << item.second;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 9e462ac671ee931fc17a31f32a76049a0990341f..1c5155df7867f95fb403d51bf633084a6c400f12 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph,
                           const std::unordered_set<const Node*>& nodes);
 
 // Some pre-defined patterns those can be reused in multiple passes.
-// The related Fluid Layer or Op should be one pattern here for better reusage
-// accross different fusion.
+// The related Fluid Layer or Op should be one pattern here for better re-usage
+// across different fusion.
 namespace patterns {
 
 struct KeyCounter {
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
index 414d8f79b15de091c62af5fe099ffae144156e4e..36f36933265c69fcd450894a3e32bbb3e547b62c 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -35,10 +35,11 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
       new proto::ProgramDesc(*program.Proto()));
 
   auto block = program_pb->mutable_blocks(kRootBlockIndex);
+  block->set_idx(kRootBlockIndex);
   block->clear_vars();
   std::unordered_set<std::string> visited_vars;
   for (ir::Node* n : graph->Nodes()) {
-    if (n->NodeType() == ir::Node::Type::kVariable) {
+    if (n->IsVar()) {
       if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
         visited_vars.insert(n->Var()->Name());
         block->add_vars()->MergeFrom(*n->Var()->Proto());
diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
index 084a4ba2def87eaa8badb3ca2c39865c6e5cb981..2ee12cc410393d1e1aa5fc9e5374d858eca1b901 100644
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
 }
 Node *NodesDFSIterator::operator->() { return stack_.top(); }
 
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inputs.size() == n;
+}
+
+// Eagerly computes a topological order of the nodes reachable from `source`
+// into sorted_. Every node in `source` must have no inputs.
+NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  // Check that every start point has in-degree 0.
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  while (!to_visit.empty()) {
+    // Snapshot the frontier, since to_visit is mutated inside the loop.
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    const size_t num_sorted = sorted_.size();
+    for (auto *p : queue) {
+      // A node is ready once every one of its inputs has been emitted.
+      if (!std::all_of(p->inputs.begin(), p->inputs.end(),
+                       [&](Node *x) { return visited.count(x) != 0; })) {
+        continue;
+      }
+      sorted_.push_back(p);
+      for (auto *out : p->outputs) {
+        if (!visited.count(out)) to_visit.insert(out);
+      }
+      to_visit.erase(p);
+      visited.insert(p);
+    }
+    // A pass that emits nothing means a cycle, or an input not reachable
+    // from `source`; fail instead of spinning forever.
+    PADDLE_ENFORCE(sorted_.size() > num_sorted, "Topological sort stalled");
+  }
+}
+
+NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+Node &NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return *sorted_[cursor_];
+}
+
+NodesTSIterator &NodesTSIterator::operator++() {
+  if (++cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
+  cursor_ = other.cursor_;
+  sorted_ = other.sorted_;
+  return *this;
+}
+
+bool NodesTSIterator::operator==(const NodesTSIterator &other) {
+  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
+}
+
+Node *NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return sorted_[cursor_];
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
index f42bab20ed97e372d2da0c4a492a4458ab94e0a0..f6772f9a37567c83c49bd44d551481edda1a74ae 100644
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@@ -62,6 +62,32 @@ struct NodesDFSIterator
   std::unordered_set<Node *> visited_;
 };
 
+// Topological sorting iterator on nodes.
+struct NodesTSIterator
+    : public std::iterator<std::forward_iterator_tag, Node *> {
+  NodesTSIterator() = default;
+  NodesTSIterator(const std::vector<Node *> &source);
+  NodesTSIterator(NodesTSIterator &&other)
+      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+    other.cursor_ = 0;
+  }
+  NodesTSIterator(const NodesTSIterator &other);
+
+  Node &operator*();
+  NodesTSIterator &operator++();
+  // NOTE: operator== compares the whole sorted sequence and the cursor. An
+  // exhausted iterator clears both (see operator++), so it compares equal to
+  // a default-constructed iterator, which serves as the end marker.
+  NodesTSIterator &operator=(const NodesTSIterator &other);
+  bool operator==(const NodesTSIterator &other);
+  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+  Node *operator->();
+
+ private:
+  std::vector<Node *> sorted_;
+  size_t cursor_{0};
+};
+
 /*
  * GraphTraits contains some graph traversal algorithms.
  *
@@ -76,6 +102,14 @@ struct GraphTraits {
                                             NodesDFSIterator());
   }
 
+  // Topological-order range over g, seeded from ExtractStartPoints(g).
+  static iterator_range<NodesTSIterator> TS(const Graph &g) {
+    auto start_points = ExtractStartPoints(g);
+    PADDLE_ENFORCE(!start_points.empty());
+    return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
+                                           NodesTSIterator());
+  }
+
  private:
   // The nodes those have no input will be treated as start points.
   static std::vector<Node *> ExtractStartPoints(const Graph &g) {
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 31ed98db72c8fd4af8c970861d386687962001ce..13dd354dc59b2bf00a741c565a4c97719eac76c3 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
 std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
-  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
+  VLOG(30) << "draw IR graph viz to " << graph_viz_path;
   std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
   PADDLE_ENFORCE(fout->good());
   std::ostream& sout = *fout;
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe..145a3a455c8ae2c1e6a5bc4fefa3491f420af5ba 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -20,7 +20,7 @@ namespace ir {
 
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
-  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  VLOG(30) << "Aplies MKL-DNN placement strategy.";
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
       n->Op()->SetAttr("use_mkldnn", true);
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index bd5b76426eb55cebdabfccd700439a4c418a10f0..532961e4d59ad3611dc93b20738080d1755290e8 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
         string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
     VarDesc repeated_var = CopyVarDesc(var_desc);
     repeated_var.SetName(new_gname);
-    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
+    VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat;
     return repeated_var;
   }
   return *var_desc;
@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
 
   std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
   auto origin_nodes = graph->ReleaseNodes();
-  VLOG(3) << "origin nodes count: " << origin_nodes.size();
+  VLOG(30) << "origin nodes count: " << origin_nodes.size();
   ir::Graph& result = *graph;
 
   // 1. record op nodes of different roles
@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
             "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
         bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
         bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
-        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
-                << new_mean_name;
+        VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to "
+                 << new_mean_name;
         repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
         repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
         repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 9277abe8c1b79c5f76f4610d0554bf337f329518..fe5d27bc4ffaa0f8eba54aec11b661971d0cb94b 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -18,7 +18,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 constexpr char Node::kControlDepVarName[];
-int Node::count_ = 0;
 
 std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                         Node::Type type) {
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d6d42f5e92080aa57445e2d6ce59aa3faa89d22d..594bfc7363130804e3e0c43fbcb43ecdaf5a1ea2 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -15,7 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+#include <typeindex>
+#include <typeinfo>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
@@ -24,9 +27,33 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-// Node should normally created by Graph::CreateXXXNode().
+// Node should only be created by Graph::CreateXXXNode().
+// 1. Every Node should be part of a graph. No dangling Node exists.
+// 2. Node only contains members necessary for building graph structure.
+//    It doesn't contain other unrelated members, such as device, etc.
+//
+// Sometimes, for specific usages, Node needs to have additional members,
+// such as device_placement, version in order to be executed. It is suggested
+// to use composition pattern.
+//
+// class RunnableOp {
+//    RunnableOp(ir::Node* n) : n_(n) { n_->WrappedBy(this); }
+//
+//    int any_thing_;
+// }
+//
+// RunnableOp is owned by the ir::Node that it wraps. In other words,
+// ir::Node is responsible for deleting RunnableOp, e.g. when the ir::Node
+// is deleted from the graph.
 class Node {
  public:
+  virtual ~Node() {
+    if (!wrapper_.empty()) {
+      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
+      wrapper_deleter_();
+    }
+  }
+
   enum class Type { kOperation, kVariable };
   static constexpr char kControlDepVarName[] = "__control_var";
 
@@ -44,6 +71,29 @@ class Node {
     return op_desc_.get();
   }
 
+  // Set the `wrapper` that wraps the Node. `wrapper` is owned by Node.
+  template <typename T>
+  void WrappedBy(T* wrapper) {
+    if (!wrapper_.empty()) {
+      wrapper_deleter_();
+    }
+    wrapper_ = wrapper;
+    wrapper_deleter_ = [wrapper]() { delete wrapper; };
+    wrapper_type_ = std::type_index(typeid(T));
+  }
+
+  // Return a reference to the `wrapper`.
+  template <typename T>
+  T& Wrapper() {
+    return *boost::any_cast<T*>(wrapper_);
+  }
+
+  // Test if the Node is wrapped by type T.
+  template <typename T>
+  bool IsWrappedBy() {
+    return std::type_index(typeid(T)) == wrapper_type_;
+  }
+
   // Please don't use this API!
   int id() const { return id_; }
 
@@ -65,36 +115,34 @@ class Node {
   int id_;
 
  private:
+  // ID can only be set by a Graph.
+  void SetId(int id) { id_ = id; }
+
   friend class Graph;
   friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                                  Node::Type type);
 
   explicit Node(const std::string& name, Type type)
-      : name_(name),
-        var_desc_(nullptr),
-        op_desc_(nullptr),
-        type_(type),
-        id_(count_++) {}
+      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
 
   explicit Node(VarDesc* var_desc)
       : name_(var_desc->Name()),
         var_desc_(new VarDesc(*var_desc)),
         op_desc_(nullptr),
-        type_(Type::kVariable),
-        id_(count_++) {}
+        type_(Type::kVariable) {}
 
   explicit Node(OpDesc* op_desc)
       : name_(op_desc->Type()),
         var_desc_(nullptr),
         op_desc_(new OpDesc(*op_desc, op_desc->Block())),
-        type_(Type::kOperation),
-        id_(count_++) {}
+        type_(Type::kOperation) {}
 
   Node() = delete;
 
-  static int count_;
-  // Please don't use this API or make this public.
-  static void ResetId() { count_ = 0; }
+  boost::any wrapper_;
+  std::function<void(void)> wrapper_deleter_;
+  std::type_index wrapper_type_ = std::type_index(typeid(void));
+
   DISABLE_COPY_AND_ASSIGN(Node);
 };
 
diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..694efadda078169c993457181c00f7b357a09e87
--- /dev/null
+++ b/paddle/fluid/framework/ir/node_test.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class RunnableOp {
+ public:
+  RunnableOp(Node* node, bool* alive) : node_(node), alive_(alive) {
+    node_->WrappedBy(this);
+  }
+
+  virtual ~RunnableOp() { *alive_ = false; }
+
+ private:
+  Node* node_;
+  bool* alive_;
+};
+
+class RunnableOp2 {
+ public:
+  RunnableOp2(Node* node, bool* alive) : node_(node), alive_(alive) {
+    node_->WrappedBy(this);
+  }
+
+  virtual ~RunnableOp2() { *alive_ = false; }
+
+ private:
+  Node* node_;
+  bool* alive_;
+};
+
+TEST(NodeTest, Basic) {
+  bool alive1 = true;
+  bool alive2 = true;
+  std::unique_ptr<Node> n1(CreateNodeForTest("n1", Node::Type::kVariable));
+  std::unique_ptr<Node> n2(CreateNodeForTest("n2", Node::Type::kVariable));
+
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp2>());
+
+  new RunnableOp(n1.get(), &alive1);
+  new RunnableOp2(n2.get(), &alive2);
+
+  EXPECT_TRUE(n1->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
+  EXPECT_TRUE(n2->IsWrappedBy<RunnableOp2>());
+
+  EXPECT_TRUE(alive1);
+  EXPECT_TRUE(alive2);
+
+  n1.reset(nullptr);
+  n2.reset(nullptr);
+  EXPECT_FALSE(alive1);
+  EXPECT_FALSE(alive2);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 9570c59cff2a6afeb1c607f7219b7b455974d6ce..e38c7ee192710747d76029d39ce3b64ca3845f5d 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -76,7 +76,7 @@ class Pass {
                    attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
       delete attr;
     };
   }
@@ -93,6 +93,7 @@ class Pass {
  protected:
   virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
     LOG(FATAL) << "Calling virtual Pass not implemented.";
+    return graph;
   }
 
  private:
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index a7d5161c35db804703415066990f34da8109fbd9..b7687d61de3eacd47ff1208ba14c3f482215c1d4 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
+#include <set>
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
@@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
 
   std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
   PDNode* act = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && acts.count(x->Op()->Type());
-
-      },
+      [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); },
       "act");
 
   PDNode* fc_out = pattern->NewNode(
@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
 
   detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                             Graph* graph) {
-    VLOG(4) << "get one concat pattern";
+    VLOG(40) << "get one concat pattern";
     // fc
     GET_NODE(fc_w, detector.pattern());
     GET_NODE(fc_bias, detector.pattern());
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 0a1f65d274708dd208d7783c6273160c4c61738a..015b5e3c6363cc96e31e21095fbbb007543c99af 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    VLOG(40) << "handle SeqConv EltAdd Relu fuse";
     GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc
index 6bc795b642bf79b7556869c5ebe9b0323d3cc5fc..660ce2ec85131bafae27e8b7800fbfa3c238b59a 100644
--- a/paddle/fluid/framework/lod_rank_table.cc
+++ b/paddle/fluid/framework/lod_rank_table.cc
@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     TableItem item;
     item.index = i;
     item.length = vec[i + 1] - vec[i];
-    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
+    VLOG(100) << "Add item to rank table " << item.index << " " << item.length;
     items_.emplace_back(item);
   }
   // NOTE(yuyang18):
diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc
index 0599c8d384641606b0a5ebb5ba1781b56f539e63..0330cae377c32b2d49d409eff42b968d81356d49 100644
--- a/paddle/fluid/framework/mixed_vector_test.cc
+++ b/paddle/fluid/framework/mixed_vector_test.cc
@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
 TEST(mixed_vector, ForEach) {
   vec<int> tmp;
   for (auto& v : tmp) {
-    VLOG(3) << v;
+    VLOG(30) << v;
   }
 }
 
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 7fb42feb95b4d54aec693228721c052f683f4d80..c384456b648d4497bf4bd003b183b773186e0f15 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -57,60 +57,58 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   }
 }
 
-void NaiveExecutor::Prepare(Scope *parent_scope,
-                            const ProgramDesc &program_desc, int block_id,
-                            bool with_feed_fetch_ops) {
-  if (!parent_scope) {
+void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
+                            int block_id, bool with_feed_fetch_ops) {
+  if (!scope) {
     scope_ = new framework::Scope;
   } else {
-    scope_ = &parent_scope->NewScope();
+    scope_ = scope;
   }
-  CreateVariables(program_desc, scope_, block_id);
+
+  VLOG(3) << "NaiveExecutor init with scope " << scope;
   CreateOps(program_desc, block_id, with_feed_fetch_ops);
 }
 
 void NaiveExecutor::Run() {
   for (auto &op : ops_) {
-    VLOG(4) << "run " << op->Type();
+    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
+            << " on scope " << scope_;
     op->Run(*scope_, place_);
   }
 }
 
-void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
-                                    int block_id) {
-  PADDLE_ENFORCE(scope);
+void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
+                                    bool persistable, Scope *scope) {
+  PADDLE_ENFORCE_NOT_NULL(scope);
+
   auto &global_block = desc.Block(block_id);
 
-  const Scope *ancestor_scope = scope;
-  while (ancestor_scope->parent()) {
-    ancestor_scope = ancestor_scope->parent();
+  const auto *anc = scope;
+  PADDLE_ENFORCE(anc->parent() != anc);
+  while (anc->parent()) {
+    anc = anc->parent();
   }
 
-  if (ancestor_scope != scope) {
-    for (auto &var : global_block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
-      }
-      // Create persistable vars in ancestor scope.
-      if (var->Persistable()) {
-        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
-      } else {  // Create temporary variables in local scope.
-        auto *ptr = scope->Var(var->Name());
+  for (auto &var : global_block.AllVars()) {
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    if (persistable == var->Persistable()) {
+      if (persistable) {
+        if (!anc->FindVar(var->Name())) {
+          auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
+          VLOG(3) << scope << " Create persistable variable " << var->Name()
+                  << ", which pointer is " << ptr;
+          InitializeVariable(ptr, var->GetType());
+        }
+      } else {
+        auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
+        VLOG(3) << scope << " Create variable " << var->Name()
+                << ", which pointer is " << ptr;
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
       }
     }
-  } else {
-    for (auto &var : global_block.AllVars()) {
-      auto *ptr = scope->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
-    }
   }
 }
 
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index ddfa6e1f4d8b73f594fc381ab505797491cdd378..5e673f68574c4ddaa4c9260367d09e9f62f6b751 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -35,8 +35,14 @@ class NaiveExecutor {
   // Create child scope.
   // Create variables.
   // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
-  void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
-               int block_id, bool with_feed_fetch_ops);
+  void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id,
+               bool with_feed_fetch_ops);
+
+  // Create variables beforehand.
+  // Create parameters if persistable is true, or create the temporary variables
+  // instead.
+  void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable,
+                       Scope* scope);
 
   // Run all the operators.
   void Run();
@@ -49,8 +55,6 @@ class NaiveExecutor {
   void CleanFeedFetchOps();
 
  protected:
-  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
-
   void CreateOps(const ProgramDesc& desc, int block_id,
                  bool with_feed_fetch_ops);
 
diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc
index 6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5..c917630666b082ab7148550707f9f1f720aa25d3 100644
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
@@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) {
 
   auto place = platform::CPUPlace();
   NaiveExecutor exe(place);
-  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
+  exe.Prepare(nullptr, program, 0, false);
   auto* a_tensor = exe.FindTensor("a");
   auto* b_tensor = exe.FindTensor("b");
   auto* c_tensor = exe.FindTensor("c");
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8177436d0bd90c3bcf8f91d5c55b66be188b19f9
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include <algorithm>
+#include <functional>
+
+#include "paddle/fluid/framework/ngraph_bridge.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphBridge::NG_NODE_MAP = {};
+
+// Dispatches `op` to the builder registered for its type in NG_NODE_MAP,
+// handing it this bridge's name -> ngraph::Node map.
+void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+  auto& op_type = op->Type();
+  // Look up with find(): operator[] on an unregistered type would insert an
+  // empty std::function into the static map and then throw
+  // std::bad_function_call when invoked.
+  auto it = NG_NODE_MAP.find(op_type);
+  PADDLE_ENFORCE(it != NG_NODE_MAP.end(),
+                 "No nGraph bridge is registered for op type %s", op_type);
+  it->second(op, ngb_node_map);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..55bf0d21f3471013b1fb780e852d813313345f03
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+// Dispatches an operator to the handler registered for its type in
+// NG_NODE_MAP.
+class NgraphBridge {
+ public:
+  // Handlers keyed by operator type. Each handler receives the operator and
+  // a shared map from string keys to nGraph nodes.
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      NG_NODE_MAP;
+
+  // Keeps a reference to `var_node_map`, which build_graph() hands to the
+  // handler it invokes.
+  explicit NgraphBridge(
+      std::shared_ptr<
+          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+          var_node_map)
+      : ngb_node_map(var_node_map) {}
+
+  // Calls the NG_NODE_MAP handler for the type of `op` with `op` and the
+  // stored node map.
+  void build_graph(const std::shared_ptr<OperatorBase>& op);
+
+ private:
+  // Node map supplied at construction.
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      ngb_node_map;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d967b2780c21713a2f9a73a3402964103f44269e
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -0,0 +1,261 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+// Paddle variable data types that can be mapped to an nGraph element type.
+static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
+    {proto::VarType::FP32, ngraph::element::f32},
+    {proto::VarType::FP64, ngraph::element::f64},
+    {proto::VarType::INT32, ngraph::element::i32},
+    {proto::VarType::INT64, ngraph::element::i64},
+    {proto::VarType::BOOL, ngraph::element::boolean},
+};
+
+typedef enum {                /* nGraph support state on ops          */
+               FULL_TRAIN,    /* Support full ops for train           */
+               PARTIAL_TRAIN, /* Support partial ops for train        */
+               FULL_TEST,     /* Support full list of ops for test    */
+               PARTIAL_TEST   /* Support partial list of ops for test */
+} op_state;
+
+// Carries a run of fused operators together with the metadata gathered by
+// FusedOperator.
+// NOTE(review): Run() and func_cache are only declared in this file; confirm
+// they are defined elsewhere before building with PADDLE_WITH_NGRAPH.
+class NgraphOperator {
+ public:
+  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
+                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                          const std::unordered_map<
+                              std::string, ngraph::element::Type>& var_type_map,
+                          const std::unordered_set<std::string>& persist,
+                          const std::unordered_set<std::string>& fetches,
+                          const std::unordered_set<std::string>& post_op_inputs,
+                          op_state ng_op_state)
+      : scope_(scope),
+        place_(place),
+        fused_ops_(ops),
+        var_type_map_(var_type_map),
+        persistables_(persist),
+        fetches_(fetches),
+        post_op_inputs_(post_op_inputs),
+        ng_op_state_(ng_op_state) {}
+
+  void Run(const Scope& scope, const platform::Place& place) const;
+
+ private:
+  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+      func_cache;
+  const Scope& scope_;
+  const platform::Place& place_;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
+  std::unordered_set<std::string> persistables_;
+  std::unordered_set<std::string> fetches_;
+  std::unordered_set<std::string> post_op_inputs_;
+  op_state ng_op_state_;
+};
+
+// Finds the runs of consecutive operators that nGraph can handle.
+//
+// Only operators located after the first run of feed ops and before the first
+// fetch op that follows are considered. Every maximal run of operators whose
+// type is registered in NgraphBridge::NG_NODE_MAP yields one entry made of two
+// iterators into `ops`: the first operator of the run and one past its last.
+// The result is empty when `ops` has no feed op, no fetch op after the feed
+// ops, or no operator in between.
+std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+FusedOperator::FusedOpIntervals(
+    std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
+  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+      intervals;
+  if (ops->empty()) {
+    return intervals;
+  }
+  size_t size = ops->size();
+  // Advance `left` to the first feed op, then past the whole run of feed ops.
+  size_t left = 0;
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
+    ++left;
+  }
+  if (left == size) {
+    return intervals;
+  }
+  while (left < size && ops->at(left)->Type() == kFeedOpType) {
+    ++left;
+  }
+
+  // `right` stops at the first fetch op at or after `left`.
+  size_t right = left;
+  while (right < size && ops->at(right)->Type() != kFetchOpType) {
+    ++right;
+  }
+  if (right == size) {
+    return intervals;
+  }
+  if (left >= right) return intervals;
+
+  // Indices in [left, right) are the operators between feed and fetch.
+  const auto& ng_node_map = paddle::framework::NgraphBridge::NG_NODE_MAP;
+  size_t pivot = left;
+  while (pivot < right) {
+    if (ng_node_map.find(ops->at(pivot)->Type()) == ng_node_map.end()) {
+      ++pivot;
+      continue;
+    }
+    // `pivot` starts a supported run; extend it as far as it goes.
+    size_t start = pivot;
+    while (pivot < right &&
+           ng_node_map.find(ops->at(pivot)->Type()) != ng_node_map.end()) {
+      ++pivot;
+    }
+    std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
+        interval = {ops->begin() + start, ops->begin() + pivot};
+    intervals.push_back(interval);
+  }
+
+  return intervals;
+}
+
+// Builds a fused operator from the operators in [start, end).
+//
+// The operators in [start, end) are moved out of their container into
+// fused_ops_, the input variable names of every operator from `end` up to the
+// next fetch op are stored in post_op_inputs_, and is_full_ is set when the
+// run is directly preceded by a feed op and directly followed by a fetch op.
+//
+// Preconditions (not checked): an operator precedes `start` in the container
+// and a fetch op exists at or after `end`. The intervals computed by
+// FusedOpIntervals() meet both while the container is left untouched.
+FusedOperator::FusedOperator(
+    const ProgramDesc& prog, size_t block_id,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, const AttributeMap& attrs)
+    : OperatorBase(type, inputs, outputs, attrs),
+      pdesc_(prog),
+      block_(block_id) {
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
+       it != end; ++it) {
+    fused_ops_.push_back(std::move(*it));
+  }
+
+  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
+       (*it)->Type() != kFetchOpType; ++it) {
+    for (auto& var_name_item : (*it)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        post_op_inputs_.insert(var_name);
+      }
+    }
+  }
+
+  if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
+    is_full_ = true;
+  }
+
+  Process();
+}
+
+// Scans block `block_` of `pdesc_`. For every variable of type SELECTED_ROWS,
+// LOD_TENSOR or LOD_TENSOR_ARRAY whose name is not kEmptyVarName it records
+// the nGraph element type (except for "feed" and "fetch") and, if the
+// variable is persistable, its name. It also records the first "X" input of
+// every fetch op. Throws when a data type has no entry in pd2ng_type_map.
+void FusedOperator::Process() {
+  auto& bdesc = pdesc_.Block(block_);
+  for (auto& var : bdesc.AllVars()) {
+    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
+          var->GetType() == proto::VarType::LOD_TENSOR ||
+          var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
+      continue;
+    }
+
+    auto var_name = var->Name();
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    if (var_name != "fetch" && var_name != "feed") {
+      auto pd_type = var->GetDataType();
+      // Look the type up once and reuse the iterator for the assignment.
+      auto ng_type = pd2ng_type_map.find(pd_type);
+      if (ng_type == pd2ng_type_map.end()) {
+        PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
+                     var_name);
+      }
+      var_type_map_[var_name] = ng_type->second;
+    }
+
+    if (var->Persistable()) {
+      persistables_.insert(var->Name());
+    }
+  }
+
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      fetches_.insert(fetch_target_name);
+    }
+  }
+}
+
+// Runs the fused operators through a freshly built NgraphOperator.
+//
+// The state handed over is PARTIAL_TRAIN when some operator type in the block
+// contains "_grad" and PARTIAL_TEST otherwise; when is_full_ is set it becomes
+// the matching FULL_TRAIN / FULL_TEST value.
+void FusedOperator::RunImpl(const Scope& scope,
+                            const platform::Place& place) const {
+  op_state ng_op_state = PARTIAL_TEST;
+  auto& bdesc = pdesc_.Block(block_);
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type().find("_grad") != std::string::npos) {
+      ng_op_state = PARTIAL_TRAIN;
+      break;
+    }
+  }
+
+  if (is_full_) {
+    ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
+  }
+
+  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
+                           persistables_, fetches_, post_op_inputs_,
+                           ng_op_state);
+  ngraph_op.Run(scope, place);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f655cef1dde624bcf4944b5c096279097e1c8ae
--- /dev/null
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include <algorithm>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/variant.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+// An operator that takes the place of a run of consecutive operators and
+// hands them to an NgraphOperator when it is run.
+class FusedOperator : public OperatorBase {
+ public:
+  // Returns the runs of consecutive nGraph-supported operators found between
+  // the feed and fetch operators of `ops`. Each entry holds two iterators:
+  // the start of a run and one past its end.
+  static std::vector<
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+  FusedOpIntervals(
+      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
+
+  // Takes over the operators in [start, end) and gathers variable metadata
+  // from block `block_id` of `prog`.
+  explicit FusedOperator(
+      const ProgramDesc& prog, size_t block_id,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+      const std::string& type = "fused_op", const VariableNameMap& inputs = {},
+      const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
+
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+ private:
+  const ProgramDesc pdesc_;
+  size_t block_;
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
+  std::unordered_set<std::string> persistables_;
+  std::unordered_set<std::string> fetches_;
+  std::unordered_set<std::string> post_op_inputs_;
+  // True when the fused run is directly preceded by a feed op and directly
+  // followed by a fetch op.
+  bool is_full_ = false;
+
+  // Fills var_type_map_, persistables_ and fetches_ from the program block.
+  void Process();
+};
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 8ece618f3f72552fedcffab3e03ebb30476b7cab..fbaa169df6324761ef9136aa173dce4e2182ed38 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
-      VLOG(3) << "input " << in << " is not LodTensor";
+      VLOG(30) << "input " << in << " is not LodTensor";
       return;
     }
     out_var->SetLoDLevel(in_var->GetLoDLevel());
@@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
     const proto::OpProto::Attr &attr = GetProtoAttr(name);
     switch (attr.type()) {
       case proto::AttrType::BOOLEANS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BOOLEANS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to BOOLEANS";
         this->attrs_[name] = std::vector<bool>();
         break;
       }
       case proto::AttrType::INTS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to INTS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to INTS";
         this->attrs_[name] = std::vector<int>();
         break;
       }
       case proto::AttrType::FLOATS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to FLOATS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to FLOATS";
         this->attrs_[name] = std::vector<float>();
         break;
       }
       case proto::AttrType::STRINGS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to STRINGS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to STRINGS";
         this->attrs_[name] = std::vector<std::string>();
         break;
       }
       case proto::AttrType::BLOCKS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BLOCKS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to BLOCKS";
         this->SetBlocksAttr(name, std::vector<BlockDesc *>());
         return;
       }
@@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() {
 }
 
 void OpDesc::InferShape(const BlockDesc &block) const {
-  VLOG(3) << "CompileTime infer shape on " << Type();
+  VLOG(30) << "CompileTime infer shape on " << Type();
   InitInferShapeFuncs();
   auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
   PADDLE_ENFORCE(static_cast<bool>(infer_shape),
                  "%s's infer_shape has not been registered", this->Type());
   CompileTimeInferShapeContext ctx(*this, block);
-  if (VLOG_IS_ON(10)) {
+  if (VLOG_IS_ON(100)) {
     std::ostringstream sout;
     auto inames = this->InputArgumentNames();
     sout << " From [";
@@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
     std::copy(onames.begin(), onames.end(),
               std::ostream_iterator<std::string>(sout, ", "));
     sout << "]";
-    VLOG(10) << sout.str();
+    VLOG(100) << sout.str();
   }
   infer_shape(&ctx);
 }
@@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
     auto shape = var->GetShape();
     res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
   } catch (...) {
-    VLOG(5) << "GetDim of variable " << name << " error";
+    VLOG(50) << "GetDim of variable " << name << " error";
     std::rethrow_exception(std::current_exception());
   }
   return res;
@@ -624,7 +624,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
       res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
     }
   } catch (...) {
-    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    VLOG(50) << "GetRepeatedDim of variable " << name << " error.";
     std::rethrow_exception(std::current_exception());
   }
   return res;
diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
index bfc411ca2c4a483e344b368da089392d8e4a87c1..4a841bae8323f5733ba413a2c623a8147ec32f67 100644
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
 
 std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const proto::OpDesc& op_desc) {
-  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
-             "instead.";
+  VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be"
+              "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
+              "instead.";
   VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
   VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
   AttributeMap attrs;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 45fc36c70633204dbfadbd10757c08b009d2cc74..5624878d439873e5f6aee6ec9234e31d5c77ff97 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(4) << place << " " << DebugStringEx(&scope);
+  VLOG(40) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   } else {
     RunImpl(scope, place);
   }
-  VLOG(3) << place << " " << DebugStringEx(&scope);
+  VLOG(30) << place << " " << DebugStringEx(&scope);
 }
 
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
           if (row_size >= 0) {
             ss << "[row_size=" << row_size << "]";
           }
+          std::string dtype = GetDtype(*scope, output.second[i]);
+          ss << ":" << dtype;
           ss << "[" << GetDims(*scope, var_name, true) << "]";
           ss << "(" << GetLoD(*scope, var_name) << ")";
         }
@@ -358,7 +360,7 @@ static bool VarIsTensor(const Variable& var) {
   return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
 
-const Tensor* GetTensorFromVar(const Variable& var) {
+const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
   if (var.IsType<LoDTensor>()) {
     return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
   } else if (var.IsType<SelectedRows>()) {
@@ -369,7 +371,7 @@ const Tensor* GetTensorFromVar(const Variable& var) {
   }
 }
 
-static Tensor* GetMutableTensorFromVar(Variable* var) {
+Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
   if (var->IsType<LoDTensor>()) {
     return var->GetMutable<LoDTensor>();
   } else if (var->IsType<SelectedRows>()) {
@@ -414,8 +416,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
 
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
-  auto* var = InputVar(name);
-  return var == nullptr ? nullptr : GetTensorFromVar(*var);
+  return Input<LoDTensor>(name);
 }
 
 template <>
@@ -425,17 +426,21 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   std::vector<const Tensor*> res;
   res.reserve(names.size());
   std::transform(names.begin(), names.end(), std::back_inserter(res),
-                 [&](const std::string& sub_name) {
+                 [&](const std::string& sub_name) -> const Tensor* {
                    auto var = scope_.FindVar(sub_name);
-                   return var == nullptr ? nullptr : GetTensorFromVar(*var);
+                   if (var == nullptr) return nullptr;
+                   PADDLE_ENFORCE(
+                       var->IsType<LoDTensor>(),
+                       "%s should be LoDTensor, but the received type is %s",
+                       sub_name, var->Type().name());
+                   return &(var->Get<LoDTensor>());
                  });
   return res;
 }
 
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
-  auto var = OutputVar(name);
-  return var == nullptr ? nullptr : GetMutableTensorFromVar(var);
+  return Output<LoDTensor>(name);
 }
 
 template <>
@@ -445,10 +450,14 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
   std::vector<Tensor*> res;
   res.reserve(names.size());
   std::transform(names.begin(), names.end(), std::back_inserter(res),
-                 [&](const std::string& sub_name) {
+                 [&](const std::string& sub_name) -> Tensor* {
                    auto var = scope_.FindVar(sub_name);
-                   return var == nullptr ? nullptr
-                                         : GetMutableTensorFromVar(var);
+                   if (var == nullptr) return nullptr;
+                   PADDLE_ENFORCE(
+                       var->IsType<LoDTensor>(),
+                       "%s should be LoDTensor, but the received type is %s",
+                       sub_name, var->Type().name());
+                   return var->GetMutable<LoDTensor>();
                  });
   return res;
 }
@@ -708,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(30) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
   // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
   if (kernel_iter == kernels.end() &&
       expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one";
     expected_kernel_key.library_type_ = LibraryType::kPlain;
     expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
     kernel_iter = kernels.find(expected_kernel_key);
@@ -767,12 +776,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
   for (auto& var_name : inplace_vars) {
-    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
-    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
+    VLOG(30) << "share inplace var " + var_name +
+                    " back to it's original scope";
+    auto* original_tensor =
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
     auto* var = transfer_scope.FindVar(var_name);
     PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
                    var_name);
-    auto* transformed_tensor = GetTensorFromVar(*var);
+    auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
     original_tensor->ShareDataWith(*transformed_tensor);
   }
 }
@@ -789,7 +800,7 @@ Scope* OperatorWithKernel::TryTransferData(
         continue;
       }
 
-      auto* tensor_in = GetTensorFromVar(*var);
+      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       if (!tensor_in->IsInitialized()) {
         continue;
       }
@@ -807,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData(
         transfered_inplace_vars->emplace_back(var_name);
       }
 
-      VLOG(3) << "Transform Variable " << var_name << " from "
-              << kernel_type_for_var << " to " << expected_kernel_key;
+      VLOG(30) << "Transform Variable " << var_name << " from "
+               << kernel_type_for_var << " to " << expected_kernel_key;
 
       if (new_scope == nullptr) {
         new_scope = &scope.NewScope();
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 96ad3205235b921a7cf60ed674a8350f74d18509..40b0130b265471a1288d966c4cbcd4f0e1bdb9f1 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -54,6 +54,9 @@ constexpr char kGradVarSuffix[] = "@GRAD";
 /// Variables with this suffix are supposed to be filled up with zeros.
 constexpr char kZeroVarSuffix[] = "@ZERO";
 
+/// Variables with this suffix are the new Gradient.
+constexpr char kNewGradSuffix[] = "@NEWGRAD@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
@@ -63,7 +66,8 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
-const Tensor* GetTensorFromVar(const Variable& var);
+const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
+Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
 class OperatorBase;
 class ExecutionContext;
@@ -224,7 +228,7 @@ class ExecutionContext {
     std::vector<const T*> res;
     res.reserve(names.size());
     std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [&](const std::string& sub_name) {
+                   [&](const std::string& sub_name) -> const T* {
                      auto var = scope_.FindVar(sub_name);
                      return var == nullptr ? nullptr : &var->Get<T>();
                    });
@@ -237,7 +241,7 @@ class ExecutionContext {
     std::vector<T*> res;
     res.reserve(names.size());
     std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [&](const std::string& sub_name) {
+                   [&](const std::string& sub_name) -> T* {
                      auto var = scope_.FindVar(sub_name);
                      return var == nullptr ? nullptr : var->GetMutable<T>();
                    });
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index dfb107688ad7281765049cd9849d56b8a61bdd37..39b47415ff7e378cabc79e668fe2be63eb71d87f 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices(
 
     auto &main_tensor = main_var->Get<LoDTensor>();
     if (!main_tensor.IsInitialized()) {
-      VLOG(3) << "one in var not inited, return!";
+      VLOG(30) << "one in var not inited, return!";
       continue;
     }
     auto &dims = main_tensor.dims();
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index a4abd1b1283f08fb8431fbeea0cea17c8439fdd7..bbeef150254f8f7a1f382a5b81055a6a5589eee1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 
 #include <memory>  // for unique_ptr
+#include <queue>
 #include <set>
+#include <unordered_set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
@@ -36,6 +38,16 @@ DEFINE_double(
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
 
+// In the inference scenario, a scope is never written by two threads at the
+// same time, but it may be read by multiple threads concurrently, and taking
+// the mutex on every access would cause serious performance issues.
+// So the mutex is disabled when `ON_INFER` is defined.
+#ifdef ON_INFER
+#define SCOPE_LOCK_GUARD
+#else
+#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   return FindVarInternal(name);
 }
 
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   return FindVarLocally(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -106,9 +118,10 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
-  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
+                 this, scope);
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
   if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
@@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
@@ -149,7 +162,7 @@ Variable* Scope::VarInternal(const std::string& name) {
 
   v = new Variable();
   vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
+  VLOG(30) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
 }
@@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const {
   return nullptr;
 }
 
+std::string GenScopeTreeDebugInfo(Scope* root) {
+  std::stringstream os;
+  // Builds a text dump of the scope tree; a null root gives "".
+  if (!root) return "";
+
+  // Breadth-first walk: print scope addresses one tree level per line.
+  std::queue<Scope*> queue;
+  queue.push(root);
+
+  std::vector<Scope*> scopes;  // visited scopes, in traversal order
+
+  while (!queue.empty()) {
+    auto* end = queue.back();  // last scope of the current level
+    Scope* q = nullptr;
+    while (q != end) {
+      q = queue.front();
+      queue.pop();
+      os << q << " ";
+      scopes.push_back(q);
+      // Queue the kids; together they form the next level.
+      for (auto* c : q->kids()) {
+        queue.push(c);
+      }
+    }
+    // End of a level: emit a separator line.
+    os << "\n------------------------------------------\n";
+  }
+
+  os << "\nDetails:\n\n";
+  // List the local variable names of every visited scope.
+  for (Scope* q : scopes) {
+    os << "====\n";
+    os << q << ":\n";
+    for (auto& var : q->LocalVarNames()) {
+      os << "  - " << var << "\n";
+    }
+  }
+
+  return os.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 9462620e829ec815e1553f6378a67463ea3b8aa3..1901ffbe57e0d85193c3a218f06eba06a0f287a5 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -78,11 +78,11 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
-  std::list<Scope*>& kids() const { return kids_; }
-
   /// Find if a scope exists in the kid scopes
   bool HasKid(const Scope* scope) const;
 
+  const std::list<Scope*>& kids() const { return kids_; }
+
   // enumerate all the variables current contains.
   std::vector<std::string> LocalVarNames() const;
 
@@ -118,12 +118,17 @@ class Scope {
 
   // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
-  Scope const* parent_{nullptr};
+  const Scope* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);
 
  private:
   mutable std::mutex mutex_;
 };
+
+// Generate a debug string describing the inheritance (parent/kid) structure
+// of a scope tree. Quite naive.
+std::string GenScopeTreeDebugInfo(Scope*);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 8c290bb095d554a973e66a3a19606a06759fd668..f4f2b769d5e47d8fba8d08476df4cd8e54133551 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -63,6 +63,26 @@ struct TensorCopyVisitor {
   int64_t size_;
 };
 
+struct TensorFillVisitor {  // sets dst[offset, offset + size) to value
+  TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size,
+                    float value)
+      : dst_(dst), dst_offset_(dst_offset), size_(size), value_(value) {}
+
+  template <typename T>
+  void apply() const {
+    // TODO(qiao): support other place
+    platform::CPUPlace cpu;
+    auto* start = dst_->mutable_data<T>(cpu) + dst_offset_;
+    // Honor the requested fill value instead of a hard-coded zero.
+    std::fill(start, start + size_, static_cast<T>(value_));
+  }
+
+  framework::Tensor* dst_;
+  int64_t dst_offset_;
+  int64_t size_;
+  float value_;
+};
+
 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
                        const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
@@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const {
                                                                    : true;
 }
 
-int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
+int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
+                                     bool is_test) {
+  if (is_test) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+
   rwlock_->RDLock();
   auto iter = id_to_index_.find(key);
   if (iter == id_to_index_.end()) {
@@ -172,22 +202,30 @@ void SelectedRows::SyncIndex() {
 }
 
 void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
-                       bool auto_grown) {
+                       bool auto_grown, bool is_test) {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
   if (ids.numel() == 0) {
-    VLOG(3) << "keys is empty, please check data!";
+    VLOG(30) << "keys is empty, please check data!";
   } else {
     int64_t value_width = value_->numel() / value_->dims()[0];
     PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                       "output tensor should have the same shape with table "
                       "except the dims[0].");
     for (int i = 0; i < ids.numel(); ++i) {
-      int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
-      framework::VisitDataType(
-          framework::ToDataType(value_->type()),
-          TensorCopyVisitor(value, i * value_width, *value_.get(),
-                            index * value_width, value_width));
+      auto id = ids.data<int64_t>()[i];
+      int64_t index = AutoGrownIndex(id, auto_grown, is_test);
+      if (index < 0) {
+        VLOG(5) << "id " << id << " not in the table, return 0";
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorFillVisitor(value, i * value_width, value_width, 0.0));
+      } else {
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorCopyVisitor(value, i * value_width, *value_.get(),
+                              index * value_width, value_width));
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index daf5e95304fb84eaba26a30c45414d5021e7ffcb..55ca02038e083da4f8984f70fecf4ca2d878088e 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -105,7 +105,7 @@ class SelectedRows {
    * the value
    */
   void Get(const framework::Tensor& ids, framework::Tensor* value,
-           bool auto_grown = false);
+           bool auto_grown = false, bool is_test = false);
 
   /*
    * @brief Get the index of the key from id_to_index_ map. If the key not
@@ -118,7 +118,7 @@ class SelectedRows {
    *
    * @return index of the key.
    */
-  int64_t AutoGrownIndex(int64_t key, bool auto_grown);
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
 
   void SyncIndex();
 
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index 9c427a4ae4c9660b107ca891a60db306cb09301f..3b0509e0344efedf08ab21cac0a075049617ca97 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -84,10 +84,14 @@ TEST(SelectedRows, SparseTable) {
       data[i * embedding_width + j] = static_cast<float>(i);
     }
   }
-  ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
-  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
-  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
-  ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
+  ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0);
+  ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
+  ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
+  ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2);
+  for (int64_t i = 11; i < 20; i++) {
+    ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1);
+    ASSERT_TRUE(!table.HasKey(i));
+  }
   ASSERT_TRUE(table.HasKey(10));
   ASSERT_TRUE(table.HasKey(8));
   ASSERT_TRUE(table.HasKey(6));
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ca1e01c89f07c4ffc3979a6a6c3728328e0a1819..8d8f07a1f52b3062498b59a4dbc20219d42e4735 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -22,8 +22,8 @@ namespace framework {
 
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
+  VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+           << dst_place;
   src.check_memory_size();
 
   dst->Resize(src.dims());
@@ -37,8 +37,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data async from " << src_place << " to "
+               << dst_place;
       return;
     }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -77,8 +77,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
       if (src_ptr == dst_ptr) {
-        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-                << dst_place;
+        VLOG(30) << "Skip copy the same data async from " << src_place << " to "
+                 << dst_place;
         return;
       }
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -114,8 +114,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst) {
-  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-          << " to " << dst_place;
+  VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place()
+           << " to " << dst_place;
   src.check_memory_size();
   dst->Resize(src.dims());
   dst->set_layout(src.layout());
@@ -125,8 +125,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
       return;
     }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -146,8 +146,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
       return;
     }
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index fcec955360f1c681a62929e904d5736854a8ffad..2dab4e793eeacd65239786976948b8043aeeb215 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -39,7 +39,7 @@ void ThreadPool::Init() {
     int num_threads = std::thread::hardware_concurrency();
     if (FLAGS_dist_threadpool_size > 0) {
       num_threads = FLAGS_dist_threadpool_size;
-      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+      VLOG(10) << "set dist_threadpool_size to " << num_threads;
     }
     PADDLE_ENFORCE_GT(num_threads, 0);
     threadpool_.reset(new ThreadPool(num_threads));
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index 7e3f002b53351ba5892aaa50482b21a83db94069..29ef459b454075a30c3a4d0ff0f9ef1212292b4b 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
 void VarDesc::SetShapes(
     const std::vector<std::vector<int64_t>> &multiple_dims) {
   if (multiple_dims.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_dims.size());
   }
   std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
 void VarDesc::SetDataTypes(
     const std::vector<proto::VarType::Type> &multiple_data_type) {
   if (multiple_data_type.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given data types("
-            << multiple_data_type.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given data types("
+             << multiple_data_type.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_data_type.size());
   }
   std::vector<proto::VarType::TensorDesc *> tensor_descs =
@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
 
 void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
   if (multiple_lod_level.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given lod_levels("
-            << multiple_lod_level.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given lod_levels("
+             << multiple_lod_level.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_lod_level.size());
   }
   switch (desc_.type().type()) {
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index f3035cd712bdea517068b4c172bb2794d5fccddb..64236b78d2e390ea5f6c43c76a4b33b62c67629f 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
 
 namespace paddle {
@@ -24,5 +27,27 @@ class VarTypeInference {
   virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };
 
+class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const final {
+    auto in_out_var_names = this->GetInputOutputWithSameType();
+    // For every (input, output) name pair, make the output match the input.
+    for (auto& i_o_n : in_out_var_names) {
+      auto& x_name = op_desc.Input(i_o_n.first).at(0);
+      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+      // Copy var type and data type from the first input var to the output.
+      auto& x = block->FindRecursiveOrCreateVar(x_name);
+      auto& out = block->FindRecursiveOrCreateVar(out_name);
+      out.SetType(x.GetType());
+      out.SetDataType(x.GetDataType());
+    }
+  }
+
+ protected:
+  virtual std::unordered_map<std::string, std::string>
+  GetInputOutputWithSameType() const = 0;
+};
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e5678cf607a8ff3763e79c1f321a81c021846fb1..022d91b465614941581617eaab281e20d4cdd950 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -27,13 +27,9 @@ set(SHARED_INFERENCE_SRCS
     io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
-if (WITH_GPU AND TENSORRT_FOUND)
-  set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine)
-  set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc)
-endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@@ -43,7 +39,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 0354f9e6e9588af601210b8a71ae98c1f90d62f0..eb89fc5e1124e97b082d6299e3efc44591a8b01b 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,24 +1,25 @@
-cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
-set(analysis_deps
-        framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
+unset(analysis_deps CACHE) # clear the cached list before redefining it
+set(analysis_deps # analysis_deps can be extended across the project
+        framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log
+        ir_pass_manager
+        CACHE INTERNAL "")
 
-cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+add_subdirectory(ir_passes)
+add_subdirectory(passes)
+
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES})
+
+cc_library(argument SRCS argument.cc DEPS scope proto_desc)
+cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc)
+
+cc_library(analysis SRCS
   analyzer.cc
   helper.cc
-  # passes
-  analysis_pass.cc
-  fluid_to_data_flow_graph_pass.cc
-  data_flow_graph_to_fluid_pass.cc
-  dfg_graphviz_draw_pass.cc
-  tensorrt_subgraph_pass.cc
-  tensorrt_subgraph_node_mark_pass.cc
-  fluid_to_ir_pass.cc
-  model_store_pass.cc
-  DEPS ${analysis_deps})
+  analysis_pass.cc # explicit extension (see CMP0115)
+  DEPS ${analysis_deps}
+  )
 
-cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
-cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
 
 function(inference_analysis_test TARGET)
   if(WITH_TESTING)
@@ -34,13 +35,3 @@ function(inference_analysis_test TARGET)
 endfunction(inference_analysis_test)
 
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
-inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
-inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
-inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
-inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
-inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
-inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
-inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
-inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
-inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
-inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index 13805ea4acf936b242bcd86b2faf89813753a9fe..299f235a74ae0ffb663be61079607d8ac1105a97 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -19,42 +19,36 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/node.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
+/*
+ * AnalysisPass is a pass used to control the IR passes.
+ */
 class AnalysisPass {
  public:
   AnalysisPass() = default;
   virtual ~AnalysisPass() = default;
-  // Mutable Pass.
-  virtual bool Initialize(Argument *argument) { return false; }
-  // Readonly Pass.
-  virtual bool Initialize(const Argument &argument) { return false; }
 
-  // Virtual method overriden by subclasses to do any necessary clean up after
-  // all passes have run.
-  virtual bool Finalize() { return false; }
-
-  // Create a debugger Pass that draw the DFG by graphviz toolkit.
-  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
-
-  // Run on a single DataFlowGraph.
-  virtual void Run(DataFlowGraph *x) = 0;
+  // Run on a single Graph.
+  void Run(Argument* argument) { RunImpl(argument); }
 
   // Human-readable short representation.
   virtual std::string repr() const = 0;
   // Human-readable long description.
   virtual std::string description() const { return "No DOC"; }
-};
 
-// GraphPass processes on any GraphType.
-class DataFlowGraphPass : public AnalysisPass {};
+ protected:
+  // User should implement these.
+  virtual void RunImpl(Argument* argument) = 0;
+
+  Argument* argument_{nullptr};
+};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index ef4142f334e503380dc7ccd74c348404ffe52ee6..c8ed373ee7c32552608d501aa642677f940cd520 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,135 +15,23 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
-
-#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
-#include "paddle/fluid/inference/analysis/model_store_pass.h"
-#include "paddle/fluid/inference/analysis/pass_manager.h"
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
-
-DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false,
-            "Enable subgraph to TensorRT engine for acceleration");
-
-DEFINE_bool(IA_enable_ir, false, "Turn on IR support");
-
-DEFINE_string(IA_graphviz_log_root, "./",
-              "Graphviz debuger for data flow graphs.");
-
-DEFINE_string(IA_output_storage_path, "", "optimized model output path");
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
+#include "paddle/fluid/inference/analysis/passes/passes.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-class DfgPassManagerImpl final : public DfgPassManager {
- public:
-  DfgPassManagerImpl() {
-    // TODO(Superjomn) set the key with pass reprs.
-    if (!FLAGS_IA_enable_ir) {
-      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
-    } else {
-      AddPass("fluid-to-ir-pass", new FluidToIrPass);
-    }
-    TryAddTensorRtPass();
-    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
-    if (!FLAGS_IA_output_storage_path.empty()) {
-      AddPass("model-store-pass", new ModelStorePass);
-    }
-  }
+Analyzer::Analyzer() {}
 
-  std::string repr() const override { return "dfg-pass-manager"; }
-  std::string description() const override { return "DFG pass manager."; }
+void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
 
- private:
-  void AddPass(const std::string& name, AnalysisPass* pass) {
-    VLOG(3) << "Adding pass " << name;
-    Register(name, pass);
-    AddGraphvizDebugerPass(pass);
-  }
+void Analyzer::RunIrAnalysis(Argument *argument) {
+  std::vector<std::string> passes({"ir_analysis_compose_pass"});
 
-  void TryAddTensorRtPass() {
-    if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
-      auto trt_teller = [&](const Node* node) {
-        std::unordered_set<std::string> teller_set(
-            {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-             "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-             "elementwise_add", "dropout"});
-        if (!node->IsFunction()) return false;
-
-        const auto* func = static_cast<const Function*>(node);
-        if (teller_set.count(func->func_type())) {
-          return true;
-        } else {
-          return false;
-        }
-      };
-
-      AddPass("tensorrt-subgraph-marker",
-              new TensorRTSubgraphNodeMarkPass(trt_teller));
-      AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
-    }
-  }
-
-  // Add the graphviz debuger pass if the parent pass has one.
-  void AddGraphvizDebugerPass(AnalysisPass* pass) {
-    auto* debuger_pass = pass->CreateGraphvizDebugerPass();
-    if (debuger_pass) {
-      Register(debuger_pass->repr(), debuger_pass);
-    }
+  for (auto &pass : passes) {
+    PassRegistry::Global().Retreive(pass)->Run(argument);
   }
-};
-
-Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
-
-void Analyzer::Run(Argument* argument) {
-  std::vector<std::string> passes;
-#ifdef PADDLE_WITH_MKLDNN
-  if (use_mkldnn_) {
-    VLOG(3) << "Adding MKL-DNN placement pass";
-    passes.push_back("mkldnn_placement_pass");
-  }
-#endif
-  // infer_clean_graph_pass should be the first default pass
-  // after mkldnn_placement_pass.
-  passes.push_back("infer_clean_graph_pass");
-  for (auto& pass : ir_passes_) {
-    if (!disabled_ir_passes_.count(pass)) {
-      passes.push_back(pass);
-      passes.push_back("graph_viz_pass");  // add graphviz for debug.
-    }
-  }
-  passes.push_back("graph_viz_pass");
-  argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
-
-  for (auto& x : data_) {
-    PADDLE_ENFORCE(x->Initialize(argument));
-    x->RunAll();
-    PADDLE_ENFORCE(x->Finalize());
-  }
-}
-
-Analyzer& Analyzer::IncludeAllIrPasses() {
-  ir_passes_ = all_ir_passes_;
-  return *this;
-}
-
-Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
-  disabled_ir_passes_.insert(passes.begin(), passes.end());
-  return *this;
-}
-
-Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
-  ir_passes_ = passes;
-  return *this;
-}
-
-Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
-  use_mkldnn_ = use_mkldnn;
-  return *this;
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 6edfc9dd117fbaa723b9af2162aea247b8aa97ea..b43e67f20f493cd8151871ca3a36eb6fdadcf9ff 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -40,59 +40,21 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
-#include "paddle/fluid/inference/analysis/pass_manager.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-class Analyzer : public OrderedRegistry<PassManager> {
+class Analyzer final {
  public:
-  // Register all the pass-managers.
   Analyzer();
 
   void Run(Argument* argument);
 
-  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
-  Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
-  Analyzer& IncludeAllIrPasses();
-  Analyzer& SetUseMkldnn(bool use_mkldnn);
-
   DISABLE_COPY_AND_ASSIGN(Analyzer);
 
- private:
-  // All avaiable IR passes.
-  // The bigger fuse comes first, so that the small operators prefer to be
-  // merged in a larger fuse op. The small fusion will not break the pattern of
-  // larger fusion.
-  const std::vector<std::string> all_ir_passes_{{
-// Manual update the passes here.
-#ifdef PADDLE_WITH_MKLDNN
-      // This pass should run before any other convolution fuse.
-      "depthwise_conv_mkldnn_pass",  //
-#endif
-      "attention_lstm_fuse_pass",       //
-      "seqconv_eltadd_relu_fuse_pass",  //
-      "embedding_fc_lstm_fuse_pass",    //
-      "fc_lstm_fuse_pass",              //
-      "mul_lstm_fuse_pass",             //
-      "fc_gru_fuse_pass",               //
-      "mul_gru_fuse_pass",              //
-      "seq_concat_fc_fuse_pass",        //
-      "fc_fuse_pass",                   //
-      "conv_bn_fuse_pass",              //
-      "conv_eltwiseadd_bn_fuse_pass",   //
-#ifdef PADDLE_WITH_MKLDNN
-      "conv_bias_mkldnn_fuse_pass",             //
-      "conv_relu_mkldnn_fuse_pass",             //
-      "conv_elementwise_add_mkldnn_fuse_pass",  //
-#endif
-  }};
-
-  std::unordered_set<std::string> disabled_ir_passes_;
-  // Ir passes to run
-  std::vector<std::string> ir_passes_;
-  bool use_mkldnn_;
+ protected:
+  void RunIrAnalysis(Argument* argument);
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 5430e5c1ef1c70d27295ebc1a9bd427cd95f006a..48fc5dda2a5bfa24d679d4bf655e580dafc614b3 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -27,21 +27,21 @@ namespace analysis {
 using namespace framework;  // NOLINT
 
 TEST(Analyzer, analysis_without_tensorrt) {
-  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   Argument argument;
-  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
+  argument.SetModelDir(FLAGS_inference_model_dir);
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
 TEST(Analyzer, analysis_with_tensorrt) {
-  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
   Argument argument;
-  argument.Set<int>("minimum_subgraph_size", new int(0));
-  argument.Set<int>("max_batch_size", new int(3));
-  argument.Set<int>("workspace_size", new int(1 << 20));
-  argument.Set<std::string>("precision_mode", new std::string("FP32"));
-  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
+  argument.SetTensorRtMaxBatchSize(3);
+  argument.SetTensorRtWorkspaceSize(1 << 20);
+  argument.SetModelDir(FLAGS_inference_model_dir);
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index e8fb0775b45761f64fd6fd28306c35b76d1e40c4..d7a2f3d1e3a3251263c8670aef5db538fa2c48ea 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -24,13 +24,20 @@
 #pragma once
 
+#include <functional>
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
+using framework::ir::Graph;
 
 /*
  * The argument definition of both Pass and PassManagers.
@@ -39,75 +46,99 @@ namespace analysis {
  */
 struct Argument {
   Argument() = default;
-  explicit Argument(const std::string& fluid_model_dir)
-      : fluid_model_dir(new std::string(fluid_model_dir)) {}
-  // The directory of the trained model.
-  std::unique_ptr<std::string> fluid_model_dir;
-  // The path of `__model__` and `param`, this is used when the file name of
-  // model and param is changed.
-  std::unique_ptr<std::string> fluid_model_program_path;
-  std::unique_ptr<std::string> fluid_model_param_path;
-
-  // The graph that process by the Passes or PassManagers.
-  std::unique_ptr<DataFlowGraph> main_dfg;
-
-  // The original program desc.
-  std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
-
-  // The processed program desc.
-  std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
-
-  // The output storage path of ModelStorePass.
-  std::unique_ptr<std::string> model_output_store_path;
-
-  // Support for any other attributes.
-  template <typename T>
-  void Set(const std::string& key, T* data) {
-    PADDLE_ENFORCE_NOT_NULL(data);
-    PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
-                   key);
-    attrs_[key] = data;
-    attr_deleters_[key] = [data, key]() {
-      VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
-      VLOG(3) << "argument delete attr: " << key;
-      delete data;
-    };
-  }
-
-  bool Has(const std::string& name) const { return attrs_.count(name); }
-
-  template <typename T>
-  T* Release(const std::string& key) {
-    PADDLE_ENFORCE(attrs_.count(key));
-    auto* res = boost::any_cast<T*>(attrs_.at(key));
-    attrs_.erase(key);
-    attr_deleters_.erase(key);
-    return res;
-  }
-
-  template <typename T>
-  T& Get(const std::string& key) {
-    PADDLE_ENFORCE(Has(key));
-    return *boost::any_cast<T*>(attrs_.at(key));
-  }
-
-  ~Argument() {
-    for (auto& item : attr_deleters_) {
-      item.second();
-    }
-  }
+  explicit Argument(const std::string& model_dir) { SetModelDir(model_dir); }
+
+  using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>;
+  using fusion_statis_t = std::unordered_map<std::string, int>;
+
+  bool Has(const std::string& key) const { return valid_fields_.count(key); }
+
+#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
+ public:                                            \
+  type__& field__() {                               \
+    PADDLE_ENFORCE(Has(#field__));                  \
+    return field__##_;                              \
+  }                                                 \
+  void Set##Field(const type__& x) {                \
+    field__##_ = x;                                 \
+    valid_fields_.insert(#field__);                 \
+  }                                                 \
+  DECL_ARGUMENT_FIELD_VALID(field__);               \
+  type__* field__##_ptr() { return &field__##_; }   \
+                                                    \
+ private:                                           \
+  type__ field__##_;
+
+#define DECL_ARGUMENT_FIELD_VALID(field__) \
+  bool field__##_valid() { return Has(#field__); }
+
+#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__)                \
+ public:                                                                  \
+  type__& field__() {                                                     \
+    PADDLE_ENFORCE_NOT_NULL(field__##_);                                  \
+    PADDLE_ENFORCE(Has(#field__));                                        \
+    return *static_cast<type__*>(field__##_.get());                       \
+  }                                                                       \
+  void Set##Field(type__* x) {                                            \
+    field__##_ =                                                          \
+        unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); }); \
+    valid_fields_.insert(#field__);                                       \
+  }                                                                       \
+  void Set##Field##NotOwned(type__* x) {                                  \
+    valid_fields_.insert(#field__);                                       \
+    field__##_ = unique_ptr_t(x, [](void* x) {});                         \
+  }                                                                       \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                     \
+  type__* field__##_ptr() {                                               \
+    PADDLE_ENFORCE(Has(#field__));                                        \
+    return static_cast<type__*>(field__##_.get());                        \
+  }                                                                       \
+  type__* Release##Field() {                                              \
+    PADDLE_ENFORCE(Has(#field__));                                        \
+    valid_fields_.erase(#field__);                                        \
+    return static_cast<type__*>(field__##_.release());                    \
+  }                                                                       \
+                                                                          \
+ private:                                                                 \
+  unique_ptr_t field__##_;
+
+  // The directory of the trained model.
+  DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
+  // Model given as separate program and parameters files.
+  DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
+  DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
+
+  // The overall graph to work on.
+  DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
+  // The overall Scope to work on.
+  DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
+
+  DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
+
+  // The IR passes to perform in the analysis phase.
+  DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
+                      std::vector<std::string>);
+
+  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
+  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
+                      std::function<bool(const framework::ir::Node*)>);
+  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
+
+  // The program transformed by the IR analysis phase.
+  DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
+                             framework::proto::ProgramDesc);
+
+  DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
 
  private:
-  std::unordered_map<std::string, boost::any> attrs_;
-  std::unordered_map<std::string, std::function<void()>> attr_deleters_;
+  std::unordered_set<std::string> valid_fields_;
 };
 
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__)               \
-  if (UNLIKELY(!(field__))) {                                \
-    LOG(ERROR) << "field " << #field__ << " should be set."; \
-    return false;                                            \
-  }
+#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
+  PADDLE_ENFORCE((argument__)->Has(#fieldname__),     \
+                 "the argument field [%s] should be set", #fieldname__);
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
deleted file mode 100644
index 8c7d58678fd29cb25d13d64a08e6c6f26f242d8b..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ /dev/null
@@ -1,496 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/node.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-using ir_node_t = framework::ir::Node;
-using ir_graph_t = framework::ir::Graph;
-
-// It is a better idea that the inputs and outputs of this graph is set manually
-// before, but there must be a Pass that helps to prune the unnecessary ops that
-// do not contribute to the given targets, so in this pass, analysis and get the
-// inputs and outputs is OK.
-void DataFlowGraph::Build() {
-  inputs_.clear();
-  outputs_.clear();
-  std::unordered_set<Node *> ins;
-  std::unordered_set<Node *> outs;
-  for (auto &node : nodes.nodes()) {
-    for (auto *in : node->inlinks) {
-      ins.insert(in);
-    }
-    for (auto *out : node->outlinks) {
-      outs.insert(out);
-    }
-  }
-
-  // The nodes that in ins but not in outs is the graph's inputs
-  // similarly, the nodes that in outs but not in ins is the graphs' outputs
-  for (auto *in : ins) {
-    if (!outs.count(in)) {
-      inputs_.push_back(in);
-    }
-  }
-  for (auto *out : outs) {
-    if (!ins.count(out)) {
-      outputs_.push_back(out);
-    }
-  }
-
-  Clean();
-}
-
-void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) {
-  // insert vars
-  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
-  // will keep updating to its latest alias during the graph-building.
-  std::unordered_map<std::string, size_t> var2id;
-  auto &main_block = prog.blocks(framework::kRootBlockIndex);
-  for (int i = 0; i < main_block.vars_size(); i++) {
-    const auto &var = main_block.vars(i);
-    auto *v = nodes.Create(Node::Type::kValue);
-    v->SetName(var.name());
-    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
-    v->SetPbMsg(var.SerializeAsString());
-    var2id[var.name()] = v->id();
-  }
-
-  // The variables in a SSA can only write once, so if a variable is written
-  // multiple times(quite common in our ProgramDesc design), multiple alias
-  // Nodes of this variable will be created, and each will just write once.
-
-  // An set that keep all the names of the variables(the original, not alias)
-  // that have been written(as outputs). Once an Op's output variable hit the
-  // set, it should create a new alias and update the global alias for this
-  // variable. And that make a Data Flow Graph a SSA.
-  std::unordered_set<Node *> unique_written_vars;
-  for (int i = 0; i < main_block.ops_size(); i++) {
-    const auto &op = main_block.ops(i);
-    auto *o = nodes.Create(Node::Type::kFunction);
-    o->SetName(op.type());
-    static_cast<Function *>(o)->SetFuncType(op.type());
-    // Link to the original protobuf message's memory, make it easier to
-    // generate from a data flow graph to fluid ProgramDesc.
-    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
-    o->SetPbMsg(op.SerializeAsString());
-
-    // set inputs and outputs
-    for (int j = 0; j < op.inputs_size(); j++) {
-      auto &in_var = op.inputs(j);
-      for (int k = 0; k < in_var.arguments_size(); k++) {
-        auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k)));
-        in->outlinks.push_back(o);
-        o->inlinks.push_back(in);
-        unique_written_vars.insert(in);
-      }
-    }
-    for (int j = 0; j < op.outputs_size(); j++) {
-      auto &out_var = op.outputs(j);
-      for (int k = 0; k < out_var.arguments_size(); k++) {
-        auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (unique_written_vars.count(out)) {
-          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
-          auto *out_alias = nodes.Create(Node::Type::kValue);
-          out_alias->SetName(out->name());
-          out_alias->SetPbDesc(out->pb_desc());
-          out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] =
-              out_alias->id();  // update variable's alias Node
-          LOG(INFO) << "loop found in graph, create SSA alias node ["
-                    << out_alias->repr() << "] for [" << out->repr() << "]";
-          out = out_alias;
-        }
-        out->inlinks.push_back(o);
-        o->outlinks.push_back(out);
-      }
-    }
-  }
-  // Analysis and extract the inputs and outputs of this graph.
-  Build();
-}
-
-void DataFlowGraph::Build(const framework::ir::Graph &graph) {
-  // Create nodes
-  std::unordered_map<ir_node_t *, Node *> ir_node_map;
-  for (auto *ir_node : graph.Nodes()) {
-    Node *x{nullptr};
-    if (ir_node->IsOp()) {
-      PADDLE_ENFORCE(ir_node->Op());
-      VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
-      x = nodes.Create(Node::Type::kFunction);
-      x->attr("ir_node").Pointer() = ir_node;
-      PADDLE_ENFORCE(ir_node->Op()->Proto());
-      x->SetName(ir_node->Op()->Proto()->type());
-      x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString());
-    } else if (ir_node->IsVar()) {
-      // Not create a Node for IR ControlDepVar, considering Inference currently
-      // just used in single thread scenerio.
-      VLOG(4) << "get var " << ir_node->Name();
-      x = nodes.Create(Node::Type::kValue);
-      x->attr("ir_node").Pointer() = ir_node;
-      x->SetName(ir_node->Name());
-      // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString());
-    } else {
-      PADDLE_THROW("Failed to create an Node from IR, unknown type");
-    }
-    ir_node_map.emplace(ir_node, x);
-  }
-  VLOG(4) << "finish creating Nodes";
-
-  VLOG(4) << "to create edge";
-  // Create links
-  for (auto *ir_node : graph.Nodes()) {
-    auto it = ir_node_map.find(ir_node);
-    // Skip ControlDepVar.
-    if (it == ir_node_map.end()) continue;
-    auto *node = it->second;
-    for (auto *x : ir_node->inputs) {
-      if (!ir_node_map.count(x)) continue;
-      node->inlinks.push_back(ir_node_map.at(x));
-    }
-    for (auto *x : ir_node->outputs) {
-      if (!ir_node_map.count(x)) continue;
-      node->outlinks.push_back(ir_node_map.at(x));
-    }
-  }
-
-  Build();
-  PADDLE_ENFORCE(!inputs_.empty(),
-                 "Can't deduce any inputs from the graph, Is the graph empty?");
-
-  ir_graph = &graph;
-  VLOG(3) << "finished build from IR";
-}
-
-void DataFlowGraph::Clean() {
-  for (auto &node : nodes.nodes()) {
-    std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
-                                           node->inlinks.end());
-    std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
-                                            node->outlinks.end());
-    if (inlinks_set.size() < node->inlinks.size()) {
-      node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
-    }
-    if (outlinks_set.size() < node->outlinks.size()) {
-      node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
-    }
-  }
-}
-
-std::string DataFlowGraph::DotString() const {
-  Dot dot;
-
-  // Add nodes
-  for (size_t i = 0; i < nodes.size(); i++) {
-    const Node &node = nodes.Get(i);
-    dot.AddNode(node.repr(), node.dot_attrs());
-  }
-
-  // Add edges
-  for (size_t i = 0; i < nodes.size(); i++) {
-    const Node &node = nodes.Get(i);
-    for (auto &in : node.inlinks) {
-      dot.AddEdge(in->repr(), node.repr(), {});
-    }
-  }
-  return dot.Build();
-}
-
-std::string DataFlowGraph::HumanReadableInfo(bool show_values,
-                                             bool show_functions) const {
-  std::stringstream values, functions;
-  for (auto &n : nodes.nodes()) {
-    if (show_values && n->IsValue()) {
-      values << n->repr() << "\n";
-    }
-    if (show_functions && n->IsFunction()) {
-      functions << n->repr() << "\n";
-    }
-  }
-  return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
-}
-
-//
-// NodesBFSIterator
-//
-
-GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
-    const std::vector<Node *> &source)
-    : queue_(source.begin(), source.end()) {}
-
-GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
-    GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
-    : queue_(std::move(other.queue_)),
-      visited_(std::move(other.visited_)) {}
-
-GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
-    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
-    : queue_(other.queue_), visited_(other.visited_) {}
-
-Node &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator*() {
-  PADDLE_ENFORCE(!queue_.empty());
-  return *queue_.front();
-}
-
-Node *GraphTraits<DataFlowGraph>::NodesBFSIterator::operator->() {
-  PADDLE_ENFORCE(!queue_.empty());
-  return queue_.front();
-}
-
-GraphTraits<DataFlowGraph>::NodesBFSIterator &
-GraphTraits<DataFlowGraph>::NodesBFSIterator::operator=(
-    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
-  queue_ = other.queue_;
-  visited_ = other.visited_;
-  return *this;
-}
-
-GraphTraits<DataFlowGraph>::NodesBFSIterator
-    &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator++() {
-  PADDLE_ENFORCE(!queue_.empty());
-  auto *cur = queue_.front();
-  visited_.insert(cur);
-  queue_.pop_front();
-  for (auto *output : cur->outlinks) {
-    if (!visited_.count(output)) {
-      queue_.push_back(output);
-      visited_.insert(output);
-    }
-  }
-  return *this;
-}
-
-bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
-    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
-  if (queue_.empty()) return other.queue_.empty();
-  if ((!queue_.empty()) && (!other.queue_.empty())) {
-    return queue_.front() == other.queue_.front() &&
-           visited_.size() == other.visited_.size();
-    // equality of queue and
-    // visited. Just a light but week implementation.
-  }
-  return false;
-}
-
-//
-// NodesDFSIterator
-//
-GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
-    const std::vector<Node *> &source) {
-  for (auto *x : source) stack_.push(x);
-}
-
-GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
-    GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
-    : stack_(std::move(other.stack_)),
-      visited_(std::move(other.visited_)) {}
-
-GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
-    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
-    : stack_(other.stack_), visited_(other.visited_) {}
-
-Node &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator*() {
-  PADDLE_ENFORCE(!stack_.empty());
-  return *stack_.top();
-}
-
-GraphTraits<DataFlowGraph>::NodesDFSIterator
-    &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator++() {
-  if (stack_.empty()) return *this;
-  visited_.insert(stack_.top());
-  auto *cur = stack_.top();
-  stack_.pop();
-  for (auto *x : cur->outlinks) {
-    if (!visited_.count(x)) {
-      stack_.push(x);
-      visited_.insert(x);
-    }
-  }
-  return *this;
-}
-bool GraphTraits<DataFlowGraph>::NodesDFSIterator::operator==(
-    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
-  if (stack_.empty()) return other.stack_.empty();
-  if ((!stack_.empty()) && (!other.stack_.empty())) {
-    return stack_.top() == other.stack_.top();
-  }
-  return false;
-}
-
-GraphTraits<DataFlowGraph>::NodesDFSIterator &
-GraphTraits<DataFlowGraph>::NodesDFSIterator::operator=(
-    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
-  stack_ = other.stack_;
-  visited_ = other.visited_;
-  return *this;
-}
-Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
-  return stack_.top();
-}
-
-inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
-  return node.inlinks.size() == n;
-}
-
-GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
-    const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      if (p->deleted()) {
-        visited.insert(p);
-        to_visit.erase(p);
-        continue;
-      }
-      inlink_visited.clear();
-
-      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) { return visited.count(x); });
-
-      if (inlink_visited.size() == p->inlinks.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outlinks) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
-    const paddle::inference::analysis::GraphTraits<
-        DataFlowGraph>::NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
-    &GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
-GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
-    const paddle::inference::analysis::GraphTraits<
-        DataFlowGraph>::NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
-    const paddle::inference::analysis::GraphTraits<
-        DataFlowGraph>::NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
-std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
-
-// Filter the Intermediate results of the subgraph node.
-void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
-  std::vector<Node *> op_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
-    if (node.type() == Node::Type::kValue || node.deleted()) {
-      continue;
-    }
-    op_nodes.push_back(&node);
-  }
-  size_t op_num = op_nodes.size();
-  for (size_t i = 0; i < op_num; i++) {
-    if (op_nodes[i]->type() == Node::Type::kFunction) continue;
-    std::unordered_set<std::string> follow_up_input_names;
-    for (size_t j = i + 1; j < op_num; j++) {
-      for (auto *in : op_nodes[j]->inlinks) {
-        follow_up_input_names.insert(in->name());
-      }
-    }
-    std::vector<Node *> filtered_subgraph_outlinks;
-    for (auto *out : op_nodes[i]->outlinks) {
-      if (follow_up_input_names.count(out->name())) {
-        filtered_subgraph_outlinks.push_back(out);
-      } else {
-        out->SetDeleted();
-      }
-    }
-    // The filtered_subgraph_outlinks may be empty.
-    op_nodes[i]->outlinks = filtered_subgraph_outlinks;
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
deleted file mode 100644
index 437e097acd24aad384df6712ce0de6106b3b5c65..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Data flow graph is an pass that build the basic graph. It contains a graph
- * and the iterators that enable the iteration over the graph.
- */
-
-#pragma once
-
-#include <deque>
-#include <stack>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/inference/analysis/graph_traits.h"
-#include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * DataFlowGraph - A container of Value and Function Nodes.
- *
- * This is the base graph for any other type of graphs, such as SSA or CFG.
- */
-struct DataFlowGraph {
-  NodeMap nodes;
-  // inputs and outputs are deduced from the graph.
-  // Used to interact with IR.
-  const framework::ir::Graph *ir_graph{nullptr};
-
-  // Extract inputs and outputs of the graph.
-  void Build();
-
-  void Build(const framework::proto::ProgramDesc &prog);
-
-  // Build a graph from ir::Graph.
-  void Build(const framework::ir::Graph &graph);
-
-  // Get an attribute.
-  AnyAttr &Attr(const std::string &key) { return attrs_[key]; }
-
-  // Output a DOT graph file for debug.
-  std::string DotString() const;
-
-  std::string HumanReadableInfo(bool show_values = true,
-                                bool show_functions = true) const;
-
-  const std::vector<Node *> &inputs() const {
-    PADDLE_ENFORCE(!inputs_.empty(),
-                   "No inputs are deduced, need to Build() first.");
-    return inputs_;
-  }
-  const std::vector<Node *> &outputs() const {
-    PADDLE_ENFORCE(!outputs_.empty(),
-                   "No outputs are deduced, need to Build() first.");
-    return outputs_;
-  }
-
- private:
-  mutable std::vector<Node *> inputs_;
-  mutable std::vector<Node *> outputs_;
-  std::unordered_map<std::string, AnyAttr> attrs_;
-
-  // Remove duplicate edges and so on.
-  void Clean();
-};
-
-/*
- * An graph trait help to traverse the graph using BFS.
- * The BFS start from a graph's inputs, the graph should be fully-connected, so
- * that the iterator can reach the end.
- */
-template <>
-struct GraphTraits<DataFlowGraph> {
-  // BFS iterator on nodes.
-  struct NodesBFSIterator
-      : public std::iterator<std::forward_iterator_tag, Node *> {
-    NodesBFSIterator() = default;
-    explicit NodesBFSIterator(const std::vector<Node *> &source);
-    NodesBFSIterator(NodesBFSIterator &&other) noexcept;
-    // NOTE Heavy to use.
-    NodesBFSIterator(const NodesBFSIterator &other);
-
-    Node &operator*();
-    NodesBFSIterator &operator++();
-    Node *operator->();
-    // TODO(Superjomn) current implementation just compare the first
-    // element, need to compare the graph and all the elements in the queue and
-    // set.
-    NodesBFSIterator &operator=(const NodesBFSIterator &other);
-    bool operator==(const NodesBFSIterator &other);
-    bool operator!=(const NodesBFSIterator &other) { return !(*this == other); }
-
-   private:
-    std::deque<Node *> queue_;
-    std::unordered_set<Node *> visited_;
-  };
-
-  // DFS iterator on nodes.
-  struct NodesDFSIterator
-      : public std::iterator<std::forward_iterator_tag, Node *> {
-    NodesDFSIterator() = default;
-    NodesDFSIterator(const std::vector<Node *> &source);
-    NodesDFSIterator(NodesDFSIterator &&other) noexcept;
-    NodesDFSIterator(const NodesDFSIterator &other);
-
-    Node &operator*();
-    NodesDFSIterator &operator++();
-    // TODO(Superjomn) current implementation just compare the first
-    // element, need to compare the graph and all the elements in the queue and
-    // set.
-    NodesDFSIterator &operator=(const NodesDFSIterator &other);
-    bool operator==(const NodesDFSIterator &other);
-    bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
-    Node *operator->();
-
-   private:
-    std::stack<Node *> stack_;
-    std::unordered_set<Node *> visited_;
-  };
-
-  // Topological sorting iterator on nodes.
-  struct NodesTSIterator
-      : public std::iterator<std::forward_iterator_tag, Node *> {
-    NodesTSIterator() = default;
-    NodesTSIterator(const std::vector<Node *> &source);
-    NodesTSIterator(NodesTSIterator &&other)
-        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-      other.cursor_ = 0;
-    }
-    NodesTSIterator(const NodesTSIterator &other);
-
-    Node &operator*();
-    NodesTSIterator &operator++();
-    // TODO(Superjomn) current implementation just compare the first
-    // element, need to compare the graph and all the elements in the queue and
-    // set.
-    NodesTSIterator &operator=(const NodesTSIterator &other);
-    bool operator==(const NodesTSIterator &other);
-    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-    Node *operator->();
-
-   private:
-    std::vector<Node *> sorted_;
-    size_t cursor_{0};
-  };
-
-  explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {}
-
-  // default use BFS to visit the nodes.
-  iterator_range<NodesBFSIterator> nodes() {
-    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
-  }
-  iterator_range<NodesBFSIterator> nodes_in_BFS() {
-    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
-  }
-  iterator_range<NodesDFSIterator> nodes_in_DFS() {
-    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
-  }
-  iterator_range<NodesTSIterator> nodes_in_TS() {
-    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
-  }
-
- private:
-  NodesBFSIterator nodes_bfs_begin() {
-    return NodesBFSIterator(graph_.inputs());
-  }
-  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
-
-  NodesDFSIterator nodes_dfs_begin() {
-    return NodesDFSIterator(graph_.inputs());
-  }
-  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
-
-  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); }
-  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
-
- private:
-  const DataFlowGraph &graph_;
-};
-
-// Extract the inputs and outputs of a graph. The inputs and outputs of a
-// sub-graph is the inputs nodes and output nodes that doesn't inside the
-// sub-graph.
-std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
-
-void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
deleted file mode 100644
index 50ce20621fb289023ecccf7bb39d98169765d5ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  auto dfg = ProgramDescToDFG(desc);
-  dfg.Build();
-
-  for (auto* in : dfg.inputs()) {
-    LOG(INFO) << "inputs: " << in->name() << " "
-              << static_cast<int>(in->type());
-  }
-  for (auto* out : dfg.outputs()) {
-    LOG(INFO) << "outputs: " << out->name() << " "
-              << static_cast<int>(out->type());
-  }
-
-  size_t count = 0;
-  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes()) {
-    LOG(INFO) << "visiting " << node.name();
-    ++count;
-  }
-  ASSERT_EQ(count, dfg.nodes.size());
-}
-
-TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  DataFlowGraph dfg;
-  dfg.Build(desc);
-  size_t count = 0;
-  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes_in_DFS()) {
-    LOG(INFO) << "visiting " << node.name();
-    ++count;
-  }
-  ASSERT_EQ(count, dfg.nodes.size());
-}
-
-// Topological sorting.
-/*
- * Graph topology
- * inputs: 0, 1, 2
- * 0 -> 4
- * 0 -> 5
- * 1 -> 6
- * 2 -> 7
- * 4 -> 5
- * 4 -> 7
- * 4 -> 3
- * 7 -> 3
- */
-TEST(DataFlowGraph, TS) {
-  DataFlowGraph graph;
-
-  for (int i = 0; i < 8; i++) {
-    auto* node = graph.nodes.Create(Node::Type::kValue);
-    node->SetName("node-" + std::to_string(i));
-  }
-
-  auto add_link = [&](int i, int j) {
-    Node* source = graph.nodes.GetMutable(i);
-    Node* target = graph.nodes.GetMutable(j);
-    target->inlinks.push_back(source);
-    source->outlinks.push_back(target);
-  };
-
-  add_link(0, 4);
-  add_link(0, 5);
-  add_link(1, 6);
-  add_link(2, 7);
-  add_link(4, 5);
-  add_link(4, 7);
-  add_link(4, 3);
-  add_link(7, 3);
-  graph.Build();
-
-  auto its = GraphTraits<DataFlowGraph>(graph).nodes_in_TS();
-  std::vector<int> sorted_ids;
-  for (auto it = its.begin(); it != its.end(); ++it) {
-    LOG(INFO) << it->name();
-    sorted_ids.push_back(it->id());
-  }
-
-  // Assert a occurs prior to b in the sorted_ids.
-  auto assert_positive_sequence_pair = [&](int a, int b) {
-    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
-    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
-    ASSERT_LT(a_offset, b_offset);
-  };
-
-  assert_positive_sequence_pair(2, 7);
-  assert_positive_sequence_pair(7, 3);
-  assert_positive_sequence_pair(4, 3);
-  assert_positive_sequence_pair(0, 4);
-  assert_positive_sequence_pair(0, 5);
-  assert_positive_sequence_pair(1, 6);
-  assert_positive_sequence_pair(4, 5);
-  assert_positive_sequence_pair(4, 7);
-}
-
-TEST(DataFlowGraph, Build_ProgramDesc) {
-  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  DataFlowGraph graph;
-  graph.Build(desc);
-  ASSERT_EQ(graph.nodes.size(), 38UL);
-}
-
-void SetOp(framework::ProgramDesc* prog, const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  op->SetInput("Xs", inputs);
-  op->SetOutput("Xs", outputs);
-  op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(framework::OpRole::kForward));
-}
-
-TEST(DataFlowGraph, Build_IR_Graph) {
-  framework::ProgramDesc prog;
-  for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(framework::proto::VarType::SELECTED_ROWS);
-    if (v == "c") {
-      var->SetPersistable(true);
-    }
-  }
-
-  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"b"}));
-  SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"c"}));
-  SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
-        std::vector<std::string>({"d"}));
-  SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
-        std::vector<std::string>({"f"}));
-
-  DataFlowGraph graph;
-
-  framework::ir::Graph ir_graph(prog);
-
-  graph.Build(ir_graph);
-
-  ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
deleted file mode 100644
index cb549f4b50cf56154a951d16b58b022dbad3e990..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
-#include <vector>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/proto_desc.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/io.h"
-
-namespace paddle {
-namespace inference {
-
-namespace analysis {
-
-using framework::proto::ProgramDesc;
-
-std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>> &nodes);
-
-bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
-  // The transformed_program_desc should inherit all the VarDesc and BlockDesc
-  // from the original program desc. The operators of the main block(the first
-  // block) should rewritten by data flow graph.
-  argument->transformed_program_desc.reset(
-      new ProgramDesc(*argument->origin_program_desc));
-  argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
-      ->clear_ops();
-  desc_ = argument->transformed_program_desc.get();
-  argument_ = argument;
-  return true;
-}
-
-bool DataFlowGraphToFluidPass::Finalize() { return true; }
-
-void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
-  // FilterRedundantOutputOfSubGraph(graph);
-  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
-    if (node.deleted()) continue;
-
-    switch (node.type()) {
-      case Node::Type::kFunction: {
-        AddFluidOp(&node);
-      } break;
-      case Node::Type::kFunctionBlock: {
-        AddEngineOp(&node);
-      } break;
-      default:
-        continue;
-    }
-  }
-
-  if (argument_->Has(framework::ir::kParamScopeAttr)) {
-    LOG(WARNING) << "parameter changes in the scope takes effect";
-  }
-
-  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
-}
-
-void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
-  PADDLE_ENFORCE(node);
-  PADDLE_ENFORCE(node->IsFunction());
-  PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(),
-                 "node has invalid protobuf repr.");
-
-  // currently only the main block is analyzed.
-  PADDLE_ENFORCE(desc_);
-  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto *op = main_block->add_ops();
-
-  if (node->pb_desc()) {
-    auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
-    *op =
-        *ori_op;  // copy the attributes, by default, these will not be changed
-    // by analysis phrase.
-    // The inputs and outputs of the existing ops are not changed by tensorrt
-    // subgraph pass.
-    // NOTE It might be changed by other passes in the long run.
-  } else {
-    op->ParseFromString(node->pb_msg());
-  }
-}
-
-void CreateTrtEngineOp(Node *node, Argument *argument,
-                       framework::proto::BlockDesc *block) {
-  PADDLE_ENFORCE(argument->main_dfg.get());
-  const DataFlowGraph &graph = *(argument->main_dfg);
-  static int counter{0};
-  PADDLE_ENFORCE(node->IsFunctionBlock());
-  framework::OpDesc desc;
-  auto *func = static_cast<FunctionBlock *>(node);
-
-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
-  for (auto *x : func->inlinks) {
-    input_names.insert(x->name());
-    input_names_with_id.insert(x->name() + std::to_string(x->id()));
-  }
-  desc.SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-
-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
-  for (auto *x : func->outlinks) {
-    output_names.insert(x->name());
-    output_names_with_id.insert(x->name() + std::to_string(x->id()));
-  }
-
-  desc.SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  desc.SetType("tensorrt_engine");
-
-  std::unordered_map<std::string, std::string> output_name_map;
-
-  // The following procedure is used to rename all the intermediate
-  // variables and the output variables of the subgraph.
-  // Why we do this?
-  // During the transition from fluid OP to tensorrt OP, we map
-  // the input and output Tensor(fluid data structure) of fluid OP
-  // to the correspondin ITensor (trt data structure) through the
-  // Tensor name. When we set up ITensor for an variable, we must
-  // ensure that it has not been set before.
-  // If there is variable in the fluid graph, which is not only the
-  // input of a OP, but also the output of a Op, there will be problems.
-  // So we have to rename the variable in the subgraph to make sure
-  // it is either an OP's input or an OP's output.
-
-  auto subgraph_nodes = func->subgraph;
-  for (int index = 0; index < block->ops_size(); index++) {
-    framework::proto::OpDesc *op = block->mutable_ops(index);
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inlinks) {
-      var2id[in_var->name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outlinks) {
-      var2id[out_var->name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
-  // When tensorrt engine runs at the end of the operation,
-  // output_mapping help us copy the data from the renamed ITensor
-  // to Tensor.
-  std::vector<std::string> output_mapping;
-  for (auto name : output_names) {
-    PADDLE_ENFORCE(output_name_map.count(name) != 0);
-    output_mapping.push_back(output_name_map[name]);
-  }
-
-  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
-  // Set attrs
-
-  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
-  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
-  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
-  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
-  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
-  node->SetPbMsg(desc.Proto()->SerializeAsString());
-}
-
-std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>> &nodes) {
-  std::vector<std::string> parameters;
-  for (const auto &node : nodes) {
-    if (!node->IsValue()) continue;
-    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
-    framework::proto::VarDesc var;
-    var.ParseFromString(node->pb_msg());
-    if (var.persistable()) {
-      parameters.push_back(var.name());
-    }
-  }
-  return parameters;
-}
-
-void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
-  // TODO(Superjomn) Here need to expose some arguments for default setting.
-  PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto *block_node = static_cast<FunctionBlock *>(node);
-  framework::proto::BlockDesc proto;
-  framework::BlockDesc block_desc(nullptr, &proto);
-  block_desc.Proto()->set_parent_idx(-1);
-  block_desc.Proto()->set_idx(0);
-  VLOG(4) << "origin variable size: "
-          << argument_->origin_program_desc->blocks(0).vars().size();
-  VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
-  // copy ops.
-
-  for (auto *node : block_node->subgraph) {
-    auto *op = block_desc.AppendOp();
-    PADDLE_ENFORCE(!node->pb_msg().empty());
-    op->Proto()->ParseFromString(node->pb_msg());
-  }
-
-  *block_desc.Proto()->mutable_vars() =
-      argument_->origin_program_desc->blocks(0).vars();
-  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, argument_, block_desc.Proto());
-  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto *op = main_block->add_ops();
-  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
-  op->ParseFromString(node->pb_msg());
-}
-
-namespace {
-class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
- public:
-  using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config &config)
-      : DFG_GraphvizDrawPass(config) {}
-
-  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
-
-  bool Finalize() override { return true; }
-};
-}  // namespace
-
-AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
-  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_IA_graphviz_log_root,
-      "data_flow_graph_to_fluid_graphviz_debugger"));
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
deleted file mode 100644
index 891c7226e245fa3b92892785362c186185a61f62..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-/*
- * This file implements the transformation from fluid ProgramDesc to data flow
- * graph.
- */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-
-namespace paddle {
-namespace inference {
-
-namespace analysis {
-class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
- public:
-  DataFlowGraphToFluidPass() = default;
-
-  bool Initialize(Argument *argument) override;
-  bool Finalize() override;
-
-  void Run(DataFlowGraph *graph) override;
-
-  std::string repr() const override { return "DFG to fluid"; }
-  std::string description() const override {
-    return "Transform a DFG to a Fluid ProgramDesc";
-  }
-
-  AnalysisPass *CreateGraphvizDebugerPass() const override;
-
- protected:
-  // Add a Fluid Op into the ProgramDesc.
-  void AddFluidOp(Node *node);
-  // Add a EngineOp into the ProgramDesc.
-  void AddEngineOp(Node *node);
-
- private:
-  framework::proto::ProgramDesc *desc_;
-  Argument *argument_;
-};
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
deleted file mode 100644
index 4ef381db295b986b91173a728b6d98640f6f4f51..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
-
-#include <glog/logging.h>
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/io.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(DataFlowGraph, Test) {
-  Argument argument(FLAGS_inference_model_dir);
-
-  FluidToDataFlowGraphPass pass0;
-  DataFlowGraphToFluidPass pass1;
-  ASSERT_TRUE(pass0.Initialize(&argument));
-  ASSERT_TRUE(pass1.Initialize(&argument));
-
-  pass0.Run(argument.main_dfg.get());
-  pass1.Run(argument.main_dfg.get());
-
-  pass0.Finalize();
-  pass1.Finalize();
-
-  LOG(INFO) << argument.main_dfg->nodes.size();
-}
-
-};  // namespace analysis
-};  // namespace inference
-};  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
deleted file mode 100644
index 648b8f7d6a6ec4bafbad2838c5631e776c8699b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-int DFG_GraphvizDrawPass::counter_{0};
-
-void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
-  auto content = Draw(graph);
-  auto dot_path = GenDotPath();
-  std::ofstream file(dot_path);
-  file.write(content.c_str(), content.size());
-  file.close();
-
-  auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
-  std::string message;
-  VLOG(3) << "draw to " << png_path;
-  ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
-}
-
-std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
-  Dot dot;
-  // Add nodes
-  for (size_t i = 0; i < graph->nodes.size(); i++) {
-    const Node &node = graph->nodes.Get(i);
-    if (config_.display_deleted_node || !node.deleted()) {
-      dot.AddNode(node.repr(), node.dot_attrs());
-    }
-  }
-  // Add edges
-  for (size_t i = 0; i < graph->nodes.size(); i++) {
-    const Node &node = graph->nodes.Get(i);
-    if (!config_.display_deleted_node && node.deleted()) continue;
-    for (auto &out : node.outlinks) {
-      if (!config_.display_deleted_node && out->deleted()) continue;
-      dot.AddEdge(node.repr(), out->repr(), {});
-    }
-  }
-  return dot.Build();
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
deleted file mode 100644
index e537bfc0e64d4ff46b3d61499a1a0298ed83533f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file create an DFG_GraphvizDrawPass which helps to draw a data flow
- * graph's structure using graphviz.
- */
-
-#pragma once
-
-#include <fstream>
-#include <string>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/dot.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Output a dot file and write to some place.
- */
-class DFG_GraphvizDrawPass : public DataFlowGraphPass {
- public:
-  struct Config {
-    Config(const std::string &dir, const std::string &id,
-           bool display_deleted_node = false)
-        : dir(dir), id(id), display_deleted_node(display_deleted_node) {}
-
-    // The directory to store the .dot or .png files.
-    const std::string dir;
-    // The identifier for this dot file.
-    const std::string id;
-    // Whether to display deleted nodes, default false.
-    const bool display_deleted_node;
-  };
-
-  explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
-
-  bool Initialize(Argument *argument) override { return true; }
-  void Run(DataFlowGraph *graph) override;
-  bool Finalize() override { return true; }
-
-  std::string repr() const override { return "DFG graphviz drawer"; }
-  std::string description() const override {
-    return "Debug a DFG by draw with graphviz";
-  }
-
- protected:
-  // A counter to add a number prefix to the debugger image output so that they
-  // will sort in the triggered order.
-  static int counter_;
-
-  // Path of the dot file to output.
-  std::string GenDotPath() const {
-    return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
-           config_.id + ".dot";
-  }
-
-  virtual std::string Draw(DataFlowGraph *graph);
-
-  Config config_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
deleted file mode 100644
index 928be7917047382d9b86294f6039b26b0ebf6f49..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-
-#include <gtest/gtest.h>
-#include <fstream>
-#include <string>
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
-  Argument argument(FLAGS_inference_model_dir);
-  FluidToDataFlowGraphPass pass0;
-  ASSERT_TRUE(pass0.Initialize(&argument));
-  pass0.Run(argument.main_dfg.get());
-
-  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
-
-  DFG_GraphvizDrawPass::Config config("./", "test");
-  DFG_GraphvizDrawPass pass(config);
-  pass.Initialize(&argument);
-  pass.Run(argument.main_dfg.get());
-
-  // test content
-  std::ifstream file("./0-graph_test.dot");
-  ASSERT_TRUE(file.is_open());
-
-  std::string line;
-  int no{0};
-  while (std::getline(file, line)) {
-    no++;
-  }
-  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 83);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc
index 56ceb9bd5d6f41a601d66f6124fb7b4099c9337e..c785a312bf96c3586ea990fd9028cfd3b930d577 100644
--- a/paddle/fluid/inference/analysis/dot_tester.cc
+++ b/paddle/fluid/inference/analysis/dot_tester.cc
@@ -16,7 +16,6 @@
 
 #include <gtest/gtest.h>
 #include <memory>
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
deleted file mode 100644
index 2b7d632c839e735ca03c6e17b94307b40cc13374..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  if (argument->origin_program_desc) {
-    LOG(WARNING) << "argument's origin_program_desc is already set, might "
-                    "duplicate called";
-  }
-  if (!argument->fluid_model_program_path) {
-    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
-    argument->fluid_model_program_path.reset(
-        new std::string(*argument->fluid_model_dir + "/__model__"));
-  }
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
-  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-  argument->origin_program_desc.reset(
-      new framework::proto::ProgramDesc(program));
-
-  if (!argument->main_dfg) {
-    argument->main_dfg.reset(new DataFlowGraph);
-  }
-  desc_ = argument->origin_program_desc.get();
-  return true;
-}
-
-bool FluidToDataFlowGraphPass::Finalize() { return true; }
-
-void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
-  PADDLE_ENFORCE(graph);
-  PADDLE_ENFORCE(desc_);
-  graph->Build(*desc_);
-}
-
-namespace {
-class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
- public:
-  using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config &config)
-      : DFG_GraphvizDrawPass(config) {}
-  std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
-  bool Finalize() override { return true; }
-};
-}
-
-AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
-  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
deleted file mode 100644
index b9e262020e9522e167b998d57e2be2ac19b48447..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-/*
- * This file implements the transformation from data flow graph to fluid
- * ProgramDesc.
- */
-
-#pragma once
-
-#include <string>
-
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Transform a FluidDesc to a SSA.
- */
-class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
- public:
-  FluidToDataFlowGraphPass() = default;
-
-  bool Initialize(Argument *argument) override;
-  bool Finalize() override;
-
-  void Run(DataFlowGraph *graph) override;
-
-  std::string repr() const override { return "fluid-to-data-flow-graph"; }
-  std::string description() const override {
-    return "transform a fluid ProgramDesc to a data flow graph.";
-  }
-
-  AnalysisPass *CreateGraphvizDebugerPass() const override;
-
- private:
-  framework::proto::ProgramDesc const *desc_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
deleted file mode 100644
index 267a0a84ebf75615e0b390f4a1b3bf3b51793fc7..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(FluidToDataFlowGraphPass, Test) {
-  FluidToDataFlowGraphPass pass;
-  Argument argument(FLAGS_inference_model_dir);
-  pass.Initialize(&argument);
-  pass.Run(argument.main_dfg.get());
-  // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
-  pass.Finalize();
-  ASSERT_FALSE(argument.main_dfg->DotString().empty());
-  EXPECT_FALSE(argument.main_dfg->inputs().empty());
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
deleted file mode 100644
index fc60ca3bd0bf706407defb2655a093d999aef7c2..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/io.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void FluidToIrPass::EnableParamModify(const std::string &model_dir,
-                                      const std::string &prog_file,
-                                      const std::string &param_file) {
-  PADDLE_ENFORCE(argument_);
-  argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope);
-  // Load parameters.
-  VLOG(3) << "Loading parameters from " << model_dir;
-  LoadParams(&argument_->Get<framework::Scope>(framework::ir::kParamScopeAttr),
-             model_dir, prog_file, param_file);
-}
-
-bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir,
-                               const std::string &prog_file,
-                               const std::string &param_file) {
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
-  framework::Executor executor(place);
-  PADDLE_ENFORCE(argument_->origin_program_desc.get());
-  framework::ProgramDesc program(*argument_->origin_program_desc);
-  if ((!prog_file.empty()) && (!param_file.empty())) {
-    LOG(INFO) << "load single model file from " << prog_file;
-    Load(&executor, scope, prog_file, param_file);
-  } else if (!dir.empty()) {
-    LOG(INFO) << "load from dir " << dir;
-    Load(&executor, scope, dir);
-  } else {
-    LOG(ERROR) << "failed to load parameters";
-    return false;
-  }
-  return true;
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
deleted file mode 100644
index c2599e218a2306f9353b843b7ea3f18aeacb008e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/flags.h"
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
-
-class FluidToIrPass final : public DataFlowGraphPass {
- public:
-  FluidToIrPass() = default;
-
-  bool Initialize(Argument *argument) override {
-    ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-    PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr),
-                   "argument need the attr %s", kFluidToIrPassesAttr);
-    argument_ = argument;
-    if (argument->origin_program_desc) {
-      LOG(WARNING) << "argument's origin_program_desc is already set, might "
-                      "duplicate called";
-    }
-    // set fluid model program path
-    if (!argument->fluid_model_program_path) {
-      ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
-      argument->fluid_model_program_path.reset(
-          new std::string(*argument->fluid_model_dir + "/__model__"));
-    }
-    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
-    // Load program.
-    auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-    argument->origin_program_desc.reset(
-        new framework::proto::ProgramDesc(program));
-    // Create main data flow graph.
-    if (!argument->main_dfg) {
-      argument->main_dfg.reset(new DataFlowGraph);
-    }
-    argument->Set("ir_program_desc", new ProgramDesc(program));
-
-    LOG(INFO) << "Loading parameters";
-    // Load parameters to argument if needed.
-    if (argument->fluid_model_dir || (argument->fluid_model_program_path &&
-                                      argument->fluid_model_param_path)) {
-#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : "";
-      SAFE_GET(fluid_model_dir);
-      SAFE_GET(fluid_model_program_path);
-      SAFE_GET(fluid_model_param_path);
-#undef SAFE_GET
-      EnableParamModify(fluid_model_dir, fluid_model_program_path,
-                        fluid_model_param_path);
-    }
-
-    return true;
-  }
-
-  bool Finalize() override { return true; }
-
-  void Run(DataFlowGraph *graph) override {
-    // Call all the IR Passes
-    IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
-                            nullptr);
-    // Pass the scope from analysis to IR if needed.
-    if (argument_->Has(framework::ir::kParamScopeAttr)) {
-      // Here the address is passed, attention that IR doesn't own the scope, so
-      // the real scope in analysis should live during the IR phase.
-      ir_passes.graph().Set(
-          framework::ir::kParamScopeAttr,
-          new framework::Scope *(&argument_->Get<framework::Scope>(
-              framework::ir::kParamScopeAttr)));
-    }
-
-    if (FLAGS_IA_enable_ir) {
-      const auto &ir_passes_to_apply =
-          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
-      ir_passes.Apply(ir_passes_to_apply);
-    }
-
-    PADDLE_ENFORCE(argument_->main_dfg.get());
-    argument_->main_dfg->Build(ir_passes.graph());
-    // inherit the arguments from ir.
-    if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
-      argument_->Set(
-          framework::ir::kFuseStatisAttr,
-          new std::unordered_map<std::string, int>(
-              ir_passes.graph().Get<std::unordered_map<std::string, int>>(
-                  framework::ir::kFuseStatisAttr)));
-    }
-  }
-
-  void EnableParamModify(const std::string &model_dir,
-                         const std::string &prog_file,
-                         const std::string &param_file);
-
-  std::string repr() const override { return "fluid-to-ir-pass"; }
-
- private:
-  // Load parameters from a single file or from a directory.
-  bool LoadParams(framework::Scope *scope, const std::string &dir,
-                  const std::string &prog_file, const std::string &param_file);
-
- private:
-  Argument *argument_{nullptr};
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc
deleted file mode 100644
index 2ea70a1d2060e03769d67060dc6f008207342b52..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/graph_traits.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/graph_traits.h"
diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h
deleted file mode 100644
index aed2b1e8e27d94b430201d70ecf09d4acc33c8fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/graph_traits.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the GraphTraits<X> template class that should be specified
- * by classes that want to be iteratable by generic graph iterators.
- *
- * This file also defines the marker class Inverse that is used to iterate over
- * graphs in a graph defined, inverse ordering...
- */
-
-#pragma once
-
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * This class should be specialized by different graph types...
- * That's why the base class is empty.
- */
-template <typename GraphType>
-struct GraphTraits {
-  // using NodesBFSIterator = xxx
-
-  // NodesBFSIterator nodes_begin();
-  // NodesBFSIterator nodes_end();
-};
-
-/*
- * Inverse - This class is used as a marker class to tell the graph iterator to
- * iterate in a graph defined Inverse order.
- */
-template <typename GraphType>
-struct Inverse {
-  const GraphType &graph;
-
-  explicit Inverse(const GraphType &graph) : graph(graph) {}
-};
-
-/*
- * Provide a partial specialization of GraphTraits so that the inverse of an
- * inverse turns into the original graph.
- */
-template <typename GraphType>
-struct GraphTraits<Inverse<Inverse<GraphType>>> : GraphTraits<GraphType> {};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 5151e2b69ac199dea136535ba445e890596f6227..5511a0481e47c4abe70af5f39be9ecc9ac10f102 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -101,20 +101,20 @@ class OrderedRegistry {
  public:
   T *Register(const std::string &name, T *x) {
     PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
-    dic_[name] = data_.size();
-    data_.emplace_back(std::unique_ptr<T>(x));
-    return data_.back().get();
+    dic_[name] = elements_.size();
+    elements_.emplace_back(std::unique_ptr<T>(x));
+    return elements_.back().get();
   }
 
   T *Lookup(const std::string &name) {
     auto it = dic_.find(name);
     if (it == dic_.end()) return nullptr;
-    return data_[it->second].get();
+    return elements_[it->second].get();
   }
 
  protected:
   std::unordered_map<std::string, int> dic_;
-  std::vector<std::unique_ptr<T>> data_;
+  std::vector<std::unique_ptr<T>> elements_;
 };
 
 template <typename T>
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index e76708baf4b39afb0febbcf3ff71281dfbfc8627..fce5e1cac92064a320179243380ea02b2c5d7838 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -18,6 +18,8 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
@@ -27,21 +29,33 @@ using string::PrettyLogEndl;
 using string::PrettyLog;
 using string::Style;
 
-IRPassManager::IRPassManager(const ProgramDesc &program,
-                             framework::Scope *scope)
-    : program_(program) {
-  graph_.reset(new framework::ir::Graph(program));
-  if (scope)
-    graph_->Set(framework::ir::kParamScopeAttr, new framework::Scope *(scope));
+IRPassManager::IRPassManager(Argument *argument) {
+  ARGUMENT_CHECK_FIELD(argument, main_program);
+  graph_ = std::unique_ptr<Graph>(new Graph(argument->main_program()));
+  if (argument->Has("scope")) {
+    graph_->Set(framework::ir::kParamScopeAttr,
+                new framework::Scope *(
+                    const_cast<framework::Scope *>(&argument->scope())));
+  }
+
+  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
+  CreatePasses(argument, argument->ir_analysis_passes());
 }
 
-void IRPassManager::Apply(const std::vector<std::string> &passes) {
-  // Apply all the passes
+void IRPassManager::CreatePasses(Argument *argument,
+                                 const std::vector<std::string> &passes) {
   std::string pre_pass;
   int pass_num = 0;
   for (const std::string &pass_name : passes) {
-    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
+
+    // Set some pass attributes.
+    if (pass_name == "ir_analysis_pass") {
+      pass->Set("tensorrt_node_teller",
+                new SubgraphDetector::NodeInsideSubgraphTeller(
+                    argument->tensorrt_node_teller()));
+    }
+
     if (pass_name == "graph_viz_pass") {
       std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
                                   (pre_pass.empty() ? "origin" : pre_pass) +
@@ -49,11 +63,47 @@ void IRPassManager::Apply(const std::vector<std::string> &passes) {
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
     }
-    graph_ = pass->Apply(std::move(graph_));
+
+    if (pass_name == "tensorrt_subgraph_pass") {
+      PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
+      pass->SetNotOwned("tensorrt_node_teller",
+                        argument->tensorrt_node_teller_ptr());
+      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
+      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
+    }
+
+    // The pass is only configured here; IRPassManager::Apply() runs it later.
     pre_pass = pass_name;
+
+    passes_.emplace_back(std::move(pass));
   }
 }
 
+// Runs every pass created by CreatePasses() on `graph`, in order, and returns
+// the final graph. With no passes the graph is handed back unchanged.
+std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
+  if (passes_.empty()) return graph;
+  PADDLE_ENFORCE(graph.get());
+  for (const auto &pass : passes_) {
+    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
+    graph = pass->Apply(std::move(graph));
+  }
+  // A by-value parameter is moved implicitly on return.
+  return graph;
+}
+
+// Applies "graph_to_program_pass" to `*graph`, attaching a copy of `program`
+// as the pass's "program" attribute, and returns that copy's proto.
+framework::proto::ProgramDesc IRPassManager::AcquireProgram(
+    std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
+  auto pass =
+      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
+  ProgramDesc desc(program);  // local copy; the pass does not own it
+  pass->SetNotOwned("program", &desc);
+  *graph = pass->Apply(std::move(*graph));  // `*graph` becomes the pass output
+  return *desc.Proto();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
index bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6..983a582649706fa6eedb5aa459b5ac53b98f658b 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -20,27 +20,38 @@
  * for inference.
  */
 
+#pragma once
+
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/argument.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 using framework::ProgramDesc;
+using framework::ir::Graph;
 
 class IRPassManager final {
  public:
-  IRPassManager(const ProgramDesc &program, framework::Scope *scope);
+  explicit IRPassManager(Argument *argument);
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
 
-  void Apply(const std::vector<std::string> &passes);
+  framework::proto::ProgramDesc AcquireProgram(
+      std::unique_ptr<Graph> *graph, const ProgramDesc &program) const;
 
   framework::ir::Graph &graph() const { return *graph_; }
 
  private:
-  std::unique_ptr<framework::ir::Graph> graph_;
-  ProgramDesc program_;
+  void CreatePasses(Argument *argument, const std::vector<std::string> &passes);
+
+  std::unique_ptr<Graph> graph_;
+  std::vector<std::unique_ptr<framework::ir::Pass>> passes_;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c71cff889ed7cdb95f79b9bc89a9ca5ab370271c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -0,0 +1,7 @@
+cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
+set(analysis_deps ${analysis_deps}
+        subgraph_detector tensorrt_subgraph_pass
+        CACHE INTERNAL "")
+
+set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
similarity index 54%
rename from paddle/fluid/inference/analysis/subgraph_splitter.cc
rename to paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index 526bbbadfe90c3064d7c620cc22e30f7fef99088..e903ec54cc4ed25ab0648c8c19caa2c8bb00b94f 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -12,46 +12,110 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
+#include <string>
+#include <utility>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/node.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-const char *SubGraphSplitter::kMarkerAttrName =
-    "_sub_graph_splitter_inside_sub_graph";
+using framework::ir::Node;
+
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inputs) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+
+  for (auto &node : graph) {
+    for (auto *in : node->inputs) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->IsVar() && !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outputs) {
+      if (!nodes.count(out) && out->IsVar()) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+// Drop subgraph-node outputs that no later op node reads as an input.
+void FilterRedundantOutputOfSubGraph(Graph *graph) {
+  std::vector<Node *> op_nodes;
+  for (auto &node : TopologicalSort(*graph)) {
+    if (node.IsVar() || Agent(&node).deleted()) {
+      continue;
+    }
+    op_nodes.push_back(&node);
+  }
+  size_t op_num = op_nodes.size();
+  for (size_t i = 0; i < op_num; i++) {
+    if (op_nodes[i]->IsOp()) continue;  // NOTE(review): always true here?
+    std::unordered_set<std::string> follow_up_input_names;
+    for (size_t j = i + 1; j < op_num; j++) {
+      for (auto *in : op_nodes[j]->inputs) {
+        follow_up_input_names.insert(in->Name());
+      }
+    }
+    std::vector<Node *> filtered_subgraph_outlinks;
+    for (auto *out : op_nodes[i]->outputs) {
+      if (follow_up_input_names.count(out->Name())) {
+        filtered_subgraph_outlinks.push_back(out);
+      } else {
+        Agent(out).set_deleted(true);
+      }
+    }
+    // The kept list may be empty; the node then ends up with no outputs.
+    op_nodes[i]->outputs = filtered_subgraph_outlinks;
+  }
+}
 
-std::vector<std::vector<Node *>> SubGraphSplitter::operator()() {
+std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
   MarkNodesInsideSubGraph();
   return ExtractSubGraphs();
 }
 
 // Mark the output variables inside a subgraph with the func.
-inline void MarkOutLinksInSubGraph(const Function *func) {
-  for (auto *var : func->outlinks) {
-    var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true;
+inline void MarkOutLinksInSubGraph(const Node *func) {
+  for (auto *var : func->outputs) {
+    Agent(var).set_marked(true);
   }
 }
 
-void SubGraphSplitter::MarkNodesInsideSubGraph() {
-  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes()) {
+void SubgraphDetector::MarkNodesInsideSubGraph() {
+  for (auto &node : framework::ir::GraphTraits::DFS(*graph_)) {
     if (node_inside_subgraph_teller_(&node)) {
-      node.attr(kMarkerAttrName).Bool() = true;
-      if (node.type() == Node::Type::kFunction) {
+      Agent(&node).set_marked(true);
+      if (node.IsOp()) {
         // If a function is inside the sub-graph, mark all the output variables
         // to be inside too, so that two marked functions will be inside a same
         // sub-graph, lets take a example:  A_function->var->B_function, if
         // A_function is marked, var should also be marked, so that B_function
         // will be in the same sub-graph with A_function if B_function is
         // marked.
-        MarkOutLinksInSubGraph(static_cast<const Function *>(&node));
+        MarkOutLinksInSubGraph(&node);
       }
     }
   }
 }
 
-const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_";
-
 // Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
 // a's output is node b, that is a and b is in the same sub-graph. The UF
 // algorithm will group them to the same cluster.
@@ -60,8 +124,8 @@ using node_map_t = std::unordered_map<int, Node *>;
 int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
   int tmp = id;
   do {
-    tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32();
-  } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp);
+    tmp = Agent(node_map.at(tmp)).union_find_parent();
+  } while (Agent(node_map.at(tmp)).union_find_parent() != tmp);
   return tmp;
 }
 // Make this two node share the same ancestor.
@@ -69,9 +133,9 @@ int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
 void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
   int a_ancestor = UnionFindGetAncestor(node_map, a);
   int b_ancestor = UnionFindGetAncestor(node_map, b);
-  node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor;
-  node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor;
-  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
+  Agent(node_map.at(b_ancestor)).set_union_find_parent(a_ancestor);
+  Agent(node_map.at(a)).set_union_find_parent(a_ancestor);
+  Agent(node_map.at(b)).set_union_find_parent(a_ancestor);
 }
 
 // This is a simple representation of a graph.
@@ -195,16 +259,21 @@ void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
   }
 }
 
-std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
   // Run the Extract algorithm to find all subgraphs.
   std::vector<Node *> marked_nodes;
   //  We use brief_node_map to represent the original graph in order to avoid
   //  changing the original graph.
   std::unordered_map<int, BriefNode *> brief_node_map;
 
-  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
+  std::unordered_set<int32_t> valid_node_ids;
+  for (auto *node : graph_->Nodes()) {
+    valid_node_ids.insert(node->id());
+  }
+
+  for (auto &node : framework::ir::GraphTraits::TS(*graph_)) {
     brief_node_map[node.id()] = new BriefNode(&node);
-    if (node.attr(kMarkerAttrName).Bool()) {
+    if (Agent(&node).marked()) {
       marked_nodes.push_back(&node);
     }
   }
@@ -213,26 +282,34 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   node_map_t node_map;  // id to ptr
   for (auto *n : marked_nodes) {
     // n's parent == n.id means it is the ancestor
-    n->attr(kUnionFindParent).Int32() = n->id();
+    Agent(n).set_union_find_parent(n->id());
     node_map[n->id()] = n;
   }
 
   // create breif node map
   for (auto &itr : brief_node_map) {
-    for (Node *node : itr.second->node->inlinks) {
-      itr.second->inlinks.push_back(brief_node_map[node->id()]);
+    for (Node *node : itr.second->node->inputs) {
+      if (!valid_node_ids.count(node->id())) {
+        LOG(INFO) << "invalid node id " << node->id();
+        continue;
+      }
+      itr.second->inlinks.push_back(brief_node_map.at(node->id()));
     }
 
-    for (Node *node : itr.second->node->outlinks) {
-      itr.second->outlinks.push_back(brief_node_map[node->id()]);
+    for (Node *node : itr.second->node->outputs) {
+      if (!valid_node_ids.count(node->id())) {
+        LOG(INFO) << "invalid node id " << node->id();
+        continue;
+      }
+      itr.second->outlinks.push_back(brief_node_map.at(node->id()));
     }
   }
 
   for (auto &itr : brief_node_map) {
     BriefNode *brief_node = itr.second;
 
-    if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+    if (!Agent(brief_node->node).marked()) {
+      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
       continue;
     }
 
@@ -254,7 +331,7 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
       std::unordered_set<BriefNode *> contract_nodes;
       for (auto *out : brief_node->outlinks) {
         // must be an trt candidate
-        if (!out->node->attr(kMarkerAttrName).Bool()) continue;
+        if (!Agent(out->node).marked()) continue;
         // get all dst input nodes except src.
         std::vector<BriefNode *> source_nodes;
         for (auto *n : out->inlinks) {
@@ -289,9 +366,8 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
 
   std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
   for (auto *n : marked_nodes) {
-    if (n->type() == Node::Type::kFunction) {
-      clusters[UnionFindGetAncestor(node_map,
-                                    n->attr(kUnionFindParent).Int32())]
+    if (n->IsOp()) {
+      clusters[UnionFindGetAncestor(node_map, Agent(n).union_find_parent())]
           .push_back(n);
     }
   }
@@ -304,28 +380,59 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   return result;
 }
 
-void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
+void SubGraphFuser::operator()() { ReplaceNodesWithSubGraphs(); }
+
+void RemoveIntermediateOutputInSubgraph(const std::vector<Node *> &subgraph,
+                                        Graph *graph,
+                                        std::vector<Node *> *outputs) {
+  // Keep only the outputs consumed by at least one node outside the subgraph.
+  const std::unordered_set<Node *> members(subgraph.begin(), subgraph.end());
+  std::unordered_set<Node *> kept;
+  for (Node *candidate : *outputs) {
+    bool used_outside = false;
+    for (Node *consumer : candidate->outputs) {
+      used_outside = used_outside || members.count(consumer) == 0;
+      if (used_outside) kept.insert(candidate);
+    }
+  }
+  // Going through a set also de-duplicates; the resulting order is unspecified.
+  outputs->assign(kept.begin(), kept.end());
+}
+
+void DetachDeletedNodes(framework::ir::Graph *graph) {
+  // Clear the own input/output lists of every node flagged as deleted.
+  for (auto *node : graph->Nodes()) {
+    if (Agent(node).deleted()) {
+      node->inputs.clear();
+      node->outputs.clear();  // links other nodes hold to `node` are not touched
+    }
+  }
+}
 
-void SubGraphFuse::ReplaceNodesWithSubGraphs() {
-  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
+void SubGraphFuser::ReplaceNodesWithSubGraphs() {
+  auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
   for (auto &subgraph : subgraphs) {
-    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
-      continue;
+    if (subgraph.size() <= min_subgraph_size_) continue;
+    LOG(INFO) << "detect a subgraph size " << subgraph.size();
     std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
     // replace this sub-graph with the first node. Two steps: 1. Create a Block
     // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
     // as deleted. 3. Replace the deleted node with the new Block Node.
-    auto *block_node = static_cast<FunctionBlock *>(
-        graph_->nodes.Create(Node::Type::kFunctionBlock));
+    framework::OpDesc empty_desc;
+    empty_desc.SetType("tensorrt_engine");
+    auto *block_node = graph_->CreateOpNode(&empty_desc);
+    Agent(block_node).set_subgraph({});
     auto io = ExtractInputAndOutputOfSubGraph(subgraph);
-    block_node->inlinks = std::move(io.first);
-    block_node->outlinks = std::move(io.second);
+    block_node->inputs = std::move(io.first);
+    block_node->outputs = std::move(io.second);
+
+    RemoveIntermediateOutputInSubgraph(subgraph, graph_, &block_node->outputs);
 
     for (auto *node : subgraph) {
       // TODO(Superjomn) need a unified mechanism to treat deleted node in each
       // pass.
-      node->SetDeleted();
-      block_node->subgraph.push_back(node);
+      Agent(node).set_deleted(true);
+      Agent(block_node).subgraph()->push_back(node);
     }
 
     // Change all the sub-graph's inputs and outputs corresponding inlink and
@@ -339,16 +446,92 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
       std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
       nodes.assign(uniq.begin(), uniq.end());
     };
-    for (auto *i : block_node->inlinks) {
-      inlink_or_outlink_cleaner(i->outlinks);
+    for (auto *i : block_node->inputs) {
+      inlink_or_outlink_cleaner(i->outputs);
     }
-    for (auto *&o : block_node->outlinks) {
-      inlink_or_outlink_cleaner(o->inlinks);
+    for (auto *&o : block_node->outputs) {
+      inlink_or_outlink_cleaner(o->inputs);
     }
   }
+  // DetachDeletedNodes(graph_);
   FilterRedundantOutputOfSubGraph(graph_);
 }
 
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return n == node.inputs.size();  // in-degree is the number of input links
+}
+
+NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  // Every start point must have in-degree 0, i.e. no input links.
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+  // The whole order is computed right here and stored in sorted_.
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+  // NOTE(review): a node that never becomes ready (e.g. on a cycle) would keep
+  std::vector<Node *> inlink_visited;  // the loop below spinning — confirm DAG.
+  while (!to_visit.empty()) {
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());  // snapshot
+    for (auto *p : queue) {
+      if (Agent(p).deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+      }
+      // NOTE(review): no `continue` above, so a deleted node is still examined.
+      inlink_visited.clear();
+      // Collect the inputs of p that have already been visited.
+      std::copy_if(p->inputs.begin(), p->inputs.end(),
+                   std::back_inserter(inlink_visited),
+                   [&](Node *x) -> bool { return visited.count(x) != 0; });
+      // p is ready once all its inputs are visited: emit it, enqueue outputs.
+      if (inlink_visited.size() == p->inputs.size()) {
+        sorted_.push_back(p);
+        for (auto *_ : p->outputs) {
+          if (!visited.count(_)) {
+            to_visit.insert(_);
+          }
+        }
+
+        to_visit.erase(p);
+        visited.insert(p);
+      }
+    }
+  }
+}
+
+NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}  // member-wise copy
+
+Node &NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());  // fails on an exhausted iterator
+  return *sorted_[cursor_];                    // node at the current position
+}
+
+NodesTSIterator &NodesTSIterator::operator++() {
+  if (++cursor_ < sorted_.size()) return *this;
+  // Exhausted: fall back to the state of a default-constructed iterator.
+  sorted_.clear();
+  cursor_ = 0;
+  return *this;
+}
+NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
+  sorted_ = other.sorted_;  // copy the precomputed order ...
+  cursor_ = other.cursor_;  // ... and the position inside it
+  return *this;
+}
+
+bool NodesTSIterator::operator==(const NodesTSIterator &other) {
+  return cursor_ == other.cursor_ && sorted_ == other.sorted_;  // both must match
+}
+
+Node *NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());  // fails on an exhausted iterator
+  return sorted_[cursor_];                     // node at the current position
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea88edd042aa9d46f66af1aa92f2cb273696c118
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
@@ -0,0 +1,182 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the classes used to partition a graph.
+ */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::ir::Graph;
+
+const char kIsFunctionNode[] = "__is_function_node__";
+const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
+const char kSubgraphSplitterMarkerAttrName[] =
+    "_sub_graph_splitter_inside_sub_graph";
+
+/*
+ * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
+ * modify the graph.
+ */
+class SubgraphDetector {
+ public:
+  // Predicate deciding whether a node may be placed inside a sub-graph.
+  using NodeInsideSubgraphTeller =
+      std::function<bool(const framework::ir::Node *)>;
+
+  SubgraphDetector(Graph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+  // Returns the detected sub-graphs, one vector of nodes per sub-graph.
+  std::vector<std::vector<framework::ir::Node *>> operator()();
+
+ protected:
+  // Mark the nodes inside the accepted sub-graph using
+  // node_inside_subgraph_teller.
+  void MarkNodesInsideSubGraph();
+
+  // Merge the marked nodes into sub-graphs and return the sub-graphs.
+  std::vector<std::vector<framework::ir::Node *>> ExtractSubGraphs();
+
+ private:
+  Graph *graph_;  // the graph to scan, stored as passed to the constructor
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;  // copy of `teller`
+};
+
+/*
+ * SubGraphFuser - Replace some nodes with the sub-graph node they are inside.
+ * To some extent, the TensorRT engine is just a fusion op for a model.
+ */
+class SubGraphFuser {
+ public:
+  using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller;
+
+  SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller,
+                int min_subgraph_size)
+      : graph_(graph),
+        node_inside_subgraph_teller_(teller),
+        min_subgraph_size_{min_subgraph_size} {}
+
+  // Entry point: performs the fusion by calling ReplaceNodesWithSubGraphs().
+  void operator()();
+
+ protected:
+  // Put one block node in place of each sub-graph; its nodes are marked deleted.
+  void ReplaceNodesWithSubGraphs();
+
+ private:
+  Graph *graph_;  // the graph that gets rewritten
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;  // given to the detector
+  int min_subgraph_size_;  // sub-graphs with size() <= this value are skipped
+};
+
+struct NodeWrapper {
+  bool deleted{false};  // set for nodes that were folded into a block node
+  bool marked{false};   // true when the node is a sub-graph candidate
+  int union_find_parent{-1};  // node id of the union-find parent; -1 until set
+  std::vector<framework::ir::Node *> subgraph;  // nodes a block node stands for
+};
+
+/*
+ * ir::Node agent for subgraph detector.
+ */
+struct Agent {
+  explicit Agent(framework::ir::Node *x) : x_(x) {}
+  // Returns the node's NodeWrapper, attaching a fresh one on first access.
+  NodeWrapper &wrapper() {
+    if (!x_->IsWrappedBy<NodeWrapper>()) {
+      x_->WrappedBy<NodeWrapper>(new NodeWrapper);  // owner presumably x_ — confirm
+    }
+    return x_->template Wrapper<NodeWrapper>();
+  }
+  // The accessors below are non-const because wrapper() may attach state.
+  bool deleted() { return wrapper().deleted; }
+  void set_deleted(bool x) { wrapper().deleted = x; }
+
+  bool marked() { return wrapper().marked; }
+  void set_marked(bool x) { wrapper().marked = x; }
+
+  void set_subgraph(const std::vector<framework::ir::Node *> &x) {
+    wrapper().subgraph = x;
+  }
+
+  int union_find_parent() { return wrapper().union_find_parent; }
+  void set_union_find_parent(int v) { wrapper().union_find_parent = v; }
+  // subgraph() hands out a pointer to the stored vector so callers can append.
+  std::vector<framework::ir::Node *> *subgraph() { return &wrapper().subgraph; }
+  std::vector<framework::ir::Node *> &inputs() { return x_->inputs; }
+  std::vector<framework::ir::Node *> &outputs() { return x_->outputs; }
+
+ private:
+  framework::ir::Node *x_;
+};
+
+// Topological sorting iterator on nodes.
+struct NodesTSIterator
+    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
+  NodesTSIterator() = default;  // empty; TopologicalSort uses it as the end
+  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
+  NodesTSIterator(NodesTSIterator &&other)
+      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+    other.cursor_ = 0;  // leave the moved-from iterator at position 0
+  }
+  NodesTSIterator(const NodesTSIterator &other);
+
+  framework::ir::Node &operator*();
+  NodesTSIterator &operator++();
+  // Assignment copies both sorted_ and cursor_. Equality compares both as
+  // well, so an iterator that operator++ has exhausted (cleared, cursor 0)
+  // equals a default-constructed one.
+  NodesTSIterator &operator=(const NodesTSIterator &other);
+  bool operator==(const NodesTSIterator &other);
+  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+  framework::ir::Node *operator->();
+
+ private:
+  std::vector<framework::ir::Node *> sorted_;  // the order, computed up front
+  size_t cursor_{0};                           // index of the current node
+};
+
+// Collects the nodes of g that have no inputs; they are treated as the start
+// points of the topological sort.
+static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
+  std::vector<framework::ir::Node *> start_points;
+  for (framework::ir::Node *candidate : g.Nodes()) {
+    const bool has_no_input = candidate->inputs.empty();
+    if (has_no_input) start_points.push_back(candidate);
+  }
+  return start_points;
+}
+
+static iterator_range<NodesTSIterator> TopologicalSort(const Graph &g) {
+  // Sort only once: a default-constructed NodesTSIterator marks the end.
+  auto start_points = ExtractStartPoints(g);
+  PADDLE_ENFORCE(!start_points.empty());
+  return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
+                                         NodesTSIterator());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f27347b9d176eae8fbd087a21bdedb9cb84085e6
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::ir::Node;
+
+std::vector<std::string> ExtractParameters(
+    const std::unordered_set<Node *> &nodes);
+
+std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
+
+    std::unique_ptr<framework::ir::Graph> graph) const {
+  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
+
+  auto teller =
+      Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
+
+  SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/);
+  fuser();
+
+  // Collect everything to drop first and remove it once after both scans, so
+  // the graph is never mutated while its node set is being iterated.
+  std::unordered_set<const Node *> nodes2remove;
+  for (auto *node : graph->Nodes()) {
+    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
+      CreateTensorRTOp(node, graph.get());
+      nodes2remove.insert(Agent(node).subgraph()->begin(),
+                          Agent(node).subgraph()->end());
+    }
+  }
+
+  for (auto *node : graph->Nodes()) {
+    if (node->IsOp() && Agent(node).deleted()) {
+      nodes2remove.insert(node);
+    }
+  }
+  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+
+  return graph;
+}
+
+void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
+                                            Graph *graph) const {
+  auto *op_desc = node->Op();
+  static int counter{0};  // shared by all calls; used for engine_uniq_key
+  auto &subgraph = *Agent(node).subgraph();
+  PADDLE_ENFORCE(!subgraph.empty());
+
+  // A fake block desc that receives a copy of every op of the sub-graph.
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  for (auto *node : subgraph) {  // NOTE: shadows the parameter `node`
+    auto *op = block_desc.AppendOp();
+    *op->Proto() = *node->Op()->Proto();
+  }
+
+  // Collect the input names, both plain and with the node id appended.
+  std::unordered_set<std::string> input_names;
+  std::unordered_set<std::string> input_names_with_id;
+  for (auto *x : node->inputs) {
+    input_names.insert(x->Name());
+    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+  }
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+  // Collect the output names in the same two forms.
+  std::unordered_set<std::string> output_names;
+  std::unordered_set<std::string> output_names_with_id;
+  for (auto *x : node->outputs) {
+    output_names.insert(x->Name());
+    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
+  }
+
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
+  op_desc->SetType("tensorrt_engine");
+  // Original output name -> renamed output name (name + node id).
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure renames all the intermediate variables and the
+  // output variables of the subgraph.
+  // Why do we do this?
+  // During the transition from fluid OP to tensorrt OP, we map the input and
+  // output Tensor (fluid data structure) of a fluid OP to the corresponding
+  // ITensor (trt data structure) through the Tensor name. When we set up an
+  // ITensor for a variable, we must ensure that it has not been set before.
+  // If a variable in the fluid graph is both the input of one OP and the
+  // output of another OP, there will be problems. So we have to rename the
+  // variables in the subgraph to make sure each one is either an OP's input
+  // or an OP's output.
+  // Concretely, the new name is the old name with the node id appended.
+
+  auto &subgraph_nodes = *Agent(node).subgraph();  // same vector as `subgraph`
+  for (int index = 0; index < block_desc.OpSize(); index++) {
+    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
+    auto correspond_node = subgraph_nodes[index];  // op #index was copied from it
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+    // Input variable name -> node id; a name missing here reads as id 0 below.
+    std::unordered_map<std::string, size_t> var2id;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+    }
+    // Rename inputs: sub-graph inputs keep their name, others get the id suffix.
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    // Rename outputs: all get the id suffix; sub-graph outputs are also recorded.
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id.count(arg_value_with_id)) {
+          output_name_map[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+
+  // When the tensorrt engine finishes running, output_mapping helps us copy
+  // the data from the renamed ITensor back to the Tensor. It holds the
+  // renamed name of every output in output_names.
+  std::vector<std::string> output_mapping;
+  for (auto name : output_names) {
+    // Every sub-graph output must have been renamed by one of the ops above.
+    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    output_mapping.push_back(output_name_map[name]);
+  }
+  // Copy the var descs of block 0 of the graph's program into the fake block.
+  *block_desc.Proto()->mutable_vars() =
+      const_cast<framework::ProgramDesc *>(&graph->program())
+          ->Proto()
+          ->blocks(0)
+          .vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
+                 "the block has no var-desc");
+  PADDLE_ENFORCE(!output_mapping.empty());
+  // Store the serialized block and the engine settings as op attributes.
+  SetAttr(op_desc->Proto(), "subgraph",
+          block_desc.Proto()->SerializeAsString());
+  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
+  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
+  SetAttr(op_desc->Proto(), "engine_uniq_key",
+          "trt-" + std::to_string(counter++));
+  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
+  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+}
+
+std::vector<std::string> ExtractParameters(
+    const std::unordered_set<Node *> &nodes) {
+  // Gather the names of all variable nodes whose var desc is persistable.
+  std::vector<std::string> persistable_names;
+  for (Node *candidate : nodes) {
+    if (candidate->IsVar() && candidate->Var()->Persistable()) {
+      persistable_names.push_back(candidate->Name());
+    }
+  }
+  return persistable_names;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_PASS(tensorrt_subgraph_pass,
+              paddle::inference::analysis::TensorRtSubgraphPass)
+    .RequirePassAttr("tensorrt_node_teller")
+    .RequirePassAttr("max_batch_size")
+    .RequirePassAttr("workspace_size");
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
similarity index 55%
rename from paddle/fluid/inference/analysis/model_store_pass_tester.cc
rename to paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index d6493fc25edf25003504542f1b01c4105754c8df..502353b95fc15e763900a0caf1649257508f0880 100644
--- a/paddle/fluid/inference/analysis/model_store_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -12,31 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/model_store_pass.h"
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
+#pragma once
+#include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-DEFINE_string(inference_model_dir, "", "Model path");
-
-TEST(DFG_StorePass, test) {
-  Analyzer analyzer;
-  Argument argument(FLAGS_inference_model_dir);
-  argument.model_output_store_path.reset(
-      new std::string("./_dfg_store_pass_tmp"));
-  // disable storage in alalyzer
-  FLAGS_IA_output_storage_path = "";
-  analyzer.Run(&argument);
+class TensorRtSubgraphPass : public framework::ir::FusePassBase {
+ public:
+  std::unique_ptr<framework::ir::Graph> ApplyImpl(
+      std::unique_ptr<framework::ir::Graph> graph) const override;
 
-  ModelStorePass pass;
-  pass.Initialize(&argument);
-  pass.Run(argument.main_dfg.get());
-}
+ private:
+  void CreateTensorRTOp(framework::ir::Node *x,
+                        framework::ir::Graph *graph) const;
+  void CleanIntermediateOutputs(framework::ir::Node *node);
+};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
deleted file mode 100644
index c313db08875669010ddcca13aa66b383ee6d26f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/model_store_pass.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/model_store_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void ModelStorePass::Run(DataFlowGraph *x) {
-  if (!argument_->fluid_model_param_path) {
-    PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
-    argument_->fluid_model_param_path.reset(
-        new std::string(*argument_->fluid_model_dir + "param"));
-  }
-  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
-  // Directly copy param file to destination.
-  std::stringstream ss;
-  // NOTE these commands only works on linux.
-  ss << "mkdir -p " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
-  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
-  ss.str("");
-
-  ss << "cp " << *argument_->fluid_model_dir << "/*"
-     << " " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
-  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
-
-  // Store program
-  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
-                          "program desc is not transformed, should call "
-                          "DataFlowGraphToFluidPass first.");
-  VLOG(3) << "store analyzed program to "
-          << *argument_->model_output_store_path;
-  const std::string program_output_path =
-      *argument_->model_output_store_path + "/__model__";
-  std::ofstream file(program_output_path, std::ios::binary);
-  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
-                 program_output_path);
-  const std::string serialized_message =
-      argument_->transformed_program_desc->SerializeAsString();
-  file.write(serialized_message.c_str(), serialized_message.size());
-}
-
-bool ModelStorePass::Finalize() { return true; }
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
deleted file mode 100644
index 3339b5044df0cf91d00aa9ddad310d4bf263bc3c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/node.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/node.h"
-#include "glog/logging.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-std::vector<Dot::Attr> Value::dot_attrs() const {
-  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
-                                 Dot::Attr("shape", "box"),
-                                 Dot::Attr("fillcolor", "red")});
-}
-
-std::vector<Dot::Attr> Function::dot_attrs() const {
-  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
-                                 Dot::Attr("shape", "diamond"),
-                                 Dot::Attr("fillcolor", "yellow")});
-}
-
-Node *NodeMap::Create(Node::Type type) {
-  switch (type) {
-    case Node::Type::kFunction:
-      nodes_.emplace_back(new Function);
-      break;
-    case Node::Type::kValue:
-      nodes_.emplace_back(new Value);
-      break;
-    case Node::Type::kFunctionBlock:
-      nodes_.emplace_back(new FunctionBlock);
-      break;
-    default:
-      PADDLE_THROW("Not supported node type.");
-  }
-  nodes_.back()->id_ = size() - 1;
-  return nodes_.back().get();
-}
-
-Node *NodeMap::GetMutable(size_t id) {
-  PADDLE_ENFORCE_GT(size(), id);
-  return nodes_[id].get();
-}
-
-const Node &NodeMap::Get(size_t id) const {
-  PADDLE_ENFORCE_GT(size(), id);
-  return *nodes_[id].get();
-}
-
-void NodeMap::Delete(size_t id) {
-  PADDLE_ENFORCE_LT(id, size());
-  nodes_[id]->SetDeleted();
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
deleted file mode 100644
index af34156bc2f101465d87cb10e2155745022eb521..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/node.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the Node class and its subclasses. A Node is the basis
- * analysis element in a computation graph.
- * There are basically two kinds of nodes, the function node and value node.
- */
-#pragma once
-
-#include <limits>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/inference/analysis/device.h"
-#include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/platform/variant.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class NodeMap;
-
-// A helper class to maintain the status from Pass.
-struct AnyAttr {
-  using any_t =
-      boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
-  // NOTE T should be a primary type or a struct combined by several primary
-  // types.
-  // NOTE the STL containers should not use here.
-  // Some usages
-  //   Attr attr;
-  //   attr.Bool() = true;
-  bool &Bool() { return As<bool>(); }
-  float &Float() { return As<float>(); }
-  int32_t &Int32() { return As<int32_t>(); }
-  int64_t &Int64() { return As<int64_t>(); }
-  void *&Pointer() { return As<void *>(); }
-  std::string &String() { return As<std::string>(); }
-
-  template <typename T>
-  T &As() {
-    if (type_index_ == typeid(AnyAttr)) {
-      type_index_ = typeid(T);
-      any_data_ = T();
-    } else {
-      PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
-    }
-    return boost::get<T>(any_data_);
-  }
-
- private:
-  any_t any_data_;
-  std::type_index type_index_{typeid(AnyAttr)};
-};
-
-/*
- * Node Representation.
- *
- * This is a very important class for analysis. It is the base class of all
- * nodes computed by a program that may be used as operands to other nodes.
- * Node is the super class of other important classes such as Function and
- * Value, some nodes can have a name.
- */
-class Node {
- public:
-  // Node type. NOTE the new node types should add here.
-  enum class Type { kNone = -1, kFunction, kValue, kFunctionBlock };
-
-  Node() = default;
-
-  // Cast to a subclass type, Function for example.
-  template <typename Subclass>
-  Subclass &As() {
-    return *dynamic_cast<Subclass *>(this);
-  }
-
-  // Formatted representation of this Node.
-  virtual std::string repr() const {
-    return name() + "(" + std::to_string(id()) + ")";
-  }
-
-  // DOT node representation. One Node type can customize its own node
-  // representation.
-  virtual std::vector<Dot::Attr> dot_attrs() const {
-    return std::vector<Dot::Attr>({Dot::Attr("style", "filled")});
-  }
-
-  // Get an additional attribute and convert it to T data type. NOTE this will
-  // silently create a new attribute if not exists.
-  AnyAttr &attr(const std::string &name) const { return attrs_[name]; }
-
-  int id() const { return id_; }
-
-  // The Protobuf description is set/get with a void* to decouple Node interface
-  // from a specific kind of Protobuf message.
-  void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
-  void *pb_desc() const { return attr("pb_desc").Pointer(); }
-
-  void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
-  const std::string &pb_msg() const { return attr("pb_msg").String(); }
-
-  void SetDeleted() { deleted_ = true; }
-  bool deleted() const { return deleted_; }
-
-  void SetName(const std::string &name) { name_ = name; }
-  const std::string &name() const { return name_; }
-
-  void SetType(Type type) { type_ = type; }
-  Type type() const { return type_; }
-
-  // Input links.
-  std::vector<Node *> inlinks;
-  // Output links.
-  std::vector<Node *> outlinks;
-
-  // Type checks.
-  bool IsFunction() const { return type_ == Node::Type::kFunction; }
-  bool IsValue() const { return type_ == Node::Type::kValue; }
-  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
-
-  virtual ~Node() {}
-
-  friend class NodeMap;
-
-  PADDLE_DISALLOW_COPY_AND_ASSIGN(Node);
-
- protected:
-  // The id number not the name is a node's unique identifier in the computation
-  // graph.
-  int id_{-1};
-  std::string name_;
-  Type type_{Type::kNone};
-  // Mark this node is deleted by some pass.
-  bool deleted_{false};
-  mutable std::unordered_map<std::string, AnyAttr> attrs_;
-};
-
-class Function;
-/*
- * Value represents a value node, it has some attributes including dims, data
- * type and so on.
- */
-class Value : public Node {
- public:
-  enum class DataType { kInt32, kInt64, kFloat32, kFloat64 };
-  using Dims = std::vector<int>;
-
-  void SetDataType(DataType data_type) { data_type_ = data_type; }
-  DataType data_type() const { return data_type_; }
-
-  void SetDims(const Dims &dims) { dims_ = dims; }
-  const Dims &dims() const { return dims_; }
-
-  Device device() const { return device_; }
-  void SetDevice(Device device) { device_ = device; }
-
-  std::vector<Dot::Attr> dot_attrs() const override;
-
-  PADDLE_DISALLOW_COPY_AND_ASSIGN(Value);
-
- protected:
-  Value() { SetType(Node::Type::kValue); }
-  friend class NodeMap;
-
- private:
-  DataType data_type_;
-  Dims dims_;
-  Device device_;
-};
-
-/*
- * Function represents any kind of executable concepts that takes several Values
- * as input, and outputs several Values.
- */
-class Function : public Node {
- public:
-  std::vector<Dot::Attr> dot_attrs() const override;
-
-  // Get the operator's type from Desc.
-  const std::string &func_type() const { return func_type_; }
-  // Set the operator's type.
-  void SetFuncType(const std::string &func_type) { func_type_ = func_type; }
-
-  PADDLE_DISALLOW_COPY_AND_ASSIGN(Function);
-
- protected:
-  std::string func_type_;
-  Function() { SetType(Node::Type::kFunction); }
-  friend class NodeMap;
-};
-
-/*
- * FunctionBlock is a Node that contains a sub-graph multiple Node.
- */
-struct FunctionBlock : public Node {
-  std::string repr() const override { return "block-" + std::to_string(id()); }
-  std::vector<Node *> subgraph;
-
- protected:
-  FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
-  friend class NodeMap;
-};
-
-class NodeMap {
- public:
-  // Create a new node with type.
-  Node *Create(Node::Type type);
-
-  // Get a node by its id.
-  Node *GetMutable(size_t id);
-
-  const Node &Get(size_t id) const;
-
-  void Delete(size_t id);
-
-  const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
-
-  size_t size() const { return nodes_.size(); }
-
- private:
-  std::vector<std::unique_ptr<Node>> nodes_;
-  std::unordered_map<std::string, Node *> map_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc
deleted file mode 100644
index 9207c15373fb4264ff0e738e93ae88e1c08b554c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/node_tester.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/node.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(NodeAttr, bool) {
-  AnyAttr x;
-  x.Bool() = true;
-  ASSERT_EQ(x.Bool(), true);
-}
-
-TEST(NodeAttr, int32) {
-  AnyAttr x;
-  x.Int32() = 32;
-  ASSERT_EQ(x.Int32(), 32);
-}
-
-TEST(NodeAttr, string) {
-  AnyAttr x;
-  x.String() = "Hello";
-  ASSERT_EQ(x.String(), "Hello");
-}
-
-TEST(Node, Attr) {
-  // Node is an abstract class, use Value instead for they share the same Attr
-  // logic.
-  NodeMap nodes;
-  auto* node = nodes.Create(Node::Type::kValue);
-  node->attr("v0").Int32() = 2008;
-  ASSERT_EQ(node->attr("v0").Int32(), 2008);
-
-  node->attr("str").String() = "hello world";
-  ASSERT_EQ(node->attr("str").String(), "hello world");
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
deleted file mode 100644
index a6ac0ee49f8f408faa7a17bf5ef5d2799a9a6238..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/pass_manager.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-bool PassManager::Initialize(Argument* argument) {
-  argument_ = argument;
-  for (auto& pass : data_) {
-    VLOG(3) << "Initializing pass [" << pass->repr() << "]";
-    if (!pass->Initialize(argument)) {
-      LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
-      return false;
-    }
-  }
-  return true;
-}
-
-void DfgPassManager::RunAll() {
-  PADDLE_ENFORCE(argument_);
-  VLOG(3) << "Total " << data_.size() << " Analysys passes";
-  for (auto& pass : data_) {
-    string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
-                          pass->repr());
-    pass->Run(argument_->main_dfg.get());
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
deleted file mode 100644
index 412747c4fcce73303703f586f7a04edf4cc5ee76..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the logic of pass management. The analysis for inference is
- * a pipeline of Passes, a PassManager is a agency that helps to manage the
- * executation of the Passes.
- *
- * There are two modes of Passes, the first one is called NodePass and takes
- * an Node as input and output; the second one is called DFGPass and takes a
- * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in
- * the same pipeline, there are two kinds of PassManagers, both takes a DFG as
- * input and output a DFG, but the Passes inside are different:
- *
- *   1. NodePassManager: the passes inside are all NodePasses, it can have
- *      different graph trivial algorithm, for example, DFS_NodePassManager will
- *      trigger the passes in depth first order;
- *   2. DfgPassManager: the passes inside are all DfgPasses.
- */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * PassManager is the base class for all pass managers, a pass manager has
- * several Pass-es registered, and execute them in the linear order.
- */
-class PassManager : public OrderedRegistry<AnalysisPass> {
- public:
-  PassManager() = default;
-  // Call all the passes' Initialize methods. The desc and data_flow_graph are
-  // globally shared, so pass them as the arguemnts for all the pass managers.
-  virtual bool Initialize(const Argument& argument) { return false; }
-
-  virtual bool Initialize(Argument* argument);
-
-  // Call all the passes' Finalize methods.
-  virtual bool Finalize() {
-    for (auto& pass : data_) {
-      if (!pass->Finalize()) {
-        LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]";
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Run all the passes.
-  virtual void RunAll() = 0;
-
-  // Short identifier.
-  virtual std::string repr() const = 0;
-  // Long description.
-  virtual std::string description() const = 0;
-
-  virtual ~PassManager() = default;
-
- protected:
-  Argument* argument_{nullptr};
-};
-
-/*
- * A pass manager that process a DFG.
- */
-class DfgPassManager : public PassManager {
- public:
-  DfgPassManager() = default;
-
-  void RunAll() override;
-
-  virtual ~DfgPassManager() = default;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
deleted file mode 100644
index 72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/pass_manager.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-class TestDfgPassManager final : public DfgPassManager {
- public:
-  TestDfgPassManager() = default;
-  virtual ~TestDfgPassManager() = default;
-  // Short identifier.
-  std::string repr() const override { return "test-pass-manager"; }
-  // Long description.
-  std::string description() const override { return "test doc"; }
-};
-
-TEST(PassManager, DFG_pass_manager) {
-  TestDfgPassManager manager;
-  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
-
-  manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass);
-  manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
-  manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
-
-  Argument argument(FLAGS_inference_model_dir);
-
-  ASSERT_TRUE(&argument);
-  ASSERT_TRUE(manager.Initialize(&argument));
-  manager.RunAll();
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a30c27b1183a75de8c0bb50ef3617d747b239fae
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -0,0 +1,9 @@
+# One library per analysis pass; analysis_passes (passes.cc) depends on both.
+cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
+
+# Append the new targets to the cached analysis_deps list.
+set(analysis_deps ${analysis_deps}
+  ir_graph_build_pass ir_analysis_pass analysis_passes
+  CACHE INTERNAL "")
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc4d0906c4f260c8f7a11832fc52eba7191c54e8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void IrAnalysisComposePass::RunImpl(Argument *argument) {
+  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
+  // TensorRT attributes are only prepared when the option is set and enabled.
+  const bool trt = argument->use_tensorrt_valid() && argument->use_tensorrt();
+  if (trt) InitTensorRTAttrs(argument);
+  ApplyIrPasses(argument);
+  CollectFusionStatis(argument);
+}
+
+std::string IrAnalysisComposePass::repr() const {
+  return std::string("ir-analysis-compose-pass");  // short pass identifier
+}
+
+// Installs the TensorRT node teller on the argument: a predicate that accepts
+// only operator nodes whose type appears in a fixed list. Does nothing unless
+// the use_tensorrt option is both set and enabled.
+void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
+  if (!(argument->use_tensorrt_valid() && argument->use_tensorrt())) return;
+  LOG(INFO) << "Initing TensorRT pass";
+  argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
+    // Function-local static: the set is built once instead of on every call
+    // of the teller; its contents never change at runtime.
+    static const std::unordered_set<std::string> teller_set(
+        {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+         "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
+         "elementwise_add", "dropout"});
+    // Non-operator nodes are never accepted.
+    if (!node->IsOp()) return false;
+    return teller_set.count(node->Op()->Type()) > 0;
+  });
+}
+
+// Runs the graph-build pass and then the IR analysis pass, in that order,
+// looking each one up by name in the global PassRegistry.
+void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
+  const std::vector<std::string> pass_names = {"ir_graph_build_pass",
+                                               "ir_analysis_pass"};
+  for (size_t i = 0; i < pass_names.size(); ++i) {
+    VLOG(2) << "Run pass " << pass_names[i];
+    PassRegistry::Global().Retreive(pass_names[i])->Run(argument);
+  }
+}
+
+void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
+  auto &&graph = argument->main_graph();  // fuse statistics live on the graph
+  if (graph.Has(framework::ir::kFuseStatisAttr)) {
+    argument->SetFusionStatis(
+        graph.Get<Argument::fusion_statis_t>(framework::ir::kFuseStatisAttr));
+  } else {
+    LOG(INFO) << "argument has no fuse statis";
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
similarity index 53%
rename from paddle/fluid/inference/analysis/model_store_pass.h
rename to paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
index f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3..53e2ebb0038a5c105f68a0146b3da90a6ae34af8 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
@@ -12,42 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/*
- * This file defines ModelStorePass, which store the runtime DFG to a Paddle
- * model in the disk, and that model can be reloaded for prediction.
- */
-
 #pragma once
+
 #include <string>
+#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/passes.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-class ModelStorePass : public DataFlowGraphPass {
+/*
+ * The analysis pass to run a list of IR passes (like a function call).
+ * Currently, it should be the first pass of analysis phase.
+ */
+class IrAnalysisComposePass : public AnalysisPass {
  public:
-  bool Initialize(Argument* argument) override {
-    if (!argument) {
-      LOG(ERROR) << "invalid argument";
-      return false;
-    }
-    argument_ = argument;
-    return true;
-  }
+  void RunImpl(Argument* argument) override;
+  std::string repr() const override;
 
-  void Run(DataFlowGraph* x) override;
+ private:
+  void InitTensorRTAttrs(Argument* argument);
 
-  std::string repr() const override { return "DFG-store-pass"; }
-  std::string description() const override {
-    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
-    model in the disk, and that model can be reloaded for prediction again.)DD";
-  }
+  void ApplyIrPasses(Argument* argument);
 
-  bool Finalize() override;
+  void CollectFusionStatis(Argument* argument);
 
- private:
-  Argument* argument_{nullptr};
+  // Assign a Scope for IR passes to modify the weights.
+  void AssignScopeToModify(Argument* argument);
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e327bd39f0ae0b8fbe3b189e4bb26a23c44d910c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// Takes the main graph out of the argument, runs it through IRPassManager,
+// records the analyzed program and hands the resulting graph back.
+void IrAnalysisPass::RunImpl(Argument* argument) {
+  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
+  ARGUMENT_CHECK_FIELD(argument, main_program);
+  ARGUMENT_CHECK_FIELD(argument, scope);
+  std::unique_ptr<Graph> graph(argument->ReleaseMainGraph());
+  IRPassManager the_ir_manager(argument);
+  graph = the_ir_manager.Apply(std::move(graph));
+  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
+  // The program description is derived from the graph produced by the passes.
+  auto* analyzed_program = new framework::proto::ProgramDesc(
+      the_ir_manager.AcquireProgram(&graph, argument->main_program()));
+  argument->SetIrAnalyzedProgram(analyzed_program);
+  argument->SetMainGraph(graph.release());
+}
+
+std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
similarity index 70%
rename from paddle/fluid/inference/analysis/node_attr_flags.h
rename to paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
index a3f70e5419a66969e8fb20152a8a8ace39316f57..d8a7449807585257c153d3c8958555ea2306afa3 100644
--- a/paddle/fluid/inference/analysis/node_attr_flags.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -12,20 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/*
- * This file contains all the flags that declared in Node::Attr.
- *
- * The Node::Attr is designed to share information between different passes, one
- * can get other's attributes in a Node by the flags in this file.
- */
 #pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
-
-DECLARE_NODE_ATTR(supported_by_tensorrt)  // bool
+/*
+ * Perform IR analysis passes.
+ *
+ * It runs the configured IR passes (e.g. fusions) over the main graph.
+ */
+class IrAnalysisPass : public AnalysisPass {
+ public:
+  void RunImpl(Argument* argument) override;
+  std::string repr() const override;
+};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30fef08b5726c965637e2fb489bdb2036bd2a8d
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <string>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+extern void ReadBinaryFile(const std::string &filename, std::string *contents);
+
+namespace analysis {
+
+// Loads the model into the argument's scope (creating the scope on demand),
+// then builds the main graph from the loaded program and attaches the
+// parameter scope to it.
+void IrGraphBuildPass::RunImpl(Argument *argument) {
+  if (!argument->scope_valid()) argument->SetScope(new framework::Scope);
+  // A model directory takes precedence over the program/params file pair.
+  std::unique_ptr<framework::ProgramDesc> program;
+  if (argument->model_dir_valid()) {
+    program = LoadModel(argument->model_dir(), argument->scope_ptr());
+  } else if (argument->model_program_path_valid() &&
+             argument->model_params_path_valid()) {
+    program = LoadModel(argument->model_program_path(),
+                        argument->model_params_path(), argument->scope_ptr());
+  } else {
+    PADDLE_THROW(
+        "either model_dir or (program path and parameter path) should be set.");
+  }
+  argument->SetMainProgram(program.release());
+
+  // The graph attribute stores a heap-allocated pointer to the scope.
+  argument->SetMainGraph(new Graph(argument->main_program()));
+  argument->main_graph().Set(framework::ir::kParamScopeAttr,
+                             new framework::Scope *(argument->scope_ptr()));
+}
+
+// Loads the model under `path` into `scope` through a CPU executor.
+std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
+    const std::string &path, framework::Scope *scope) {
+  framework::Executor cpu_executor{platform::CPUPlace()};
+  return Load(&cpu_executor, scope, path);
+}
+
+// Loads a model from separate program/params files into `scope` on the CPU.
+std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
+    const std::string &program_path, const std::string &params_path,
+    framework::Scope *scope) {
+  framework::Executor cpu_executor{platform::CPUPlace()};
+  return Load(&cpu_executor, scope, program_path, params_path);
+}
+
+std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..3291e4f6ad3ca3079e672350805cab1f1e7b2413
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Load program and parameter to memory from the disk.
+ */
+class IrGraphBuildPass : public AnalysisPass {
+ public:
+  // Loads the model described by the argument and builds its main graph.
+  void RunImpl(Argument *argument) override;
+  std::string repr() const override;
+
+ private:
+  // Loads a model from a directory, or from a program/params file pair.
+  std::unique_ptr<framework::ProgramDesc> LoadModel(const std::string &path,
+                                                    framework::Scope *scope);
+  std::unique_ptr<framework::ProgramDesc> LoadModel(
+      const std::string &program_path, const std::string &params_path,
+      framework::Scope *scope);
+  std::string model_binary_str_;  // NOTE(review): unused in the .cc file
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ef515f45f2483df8d1238b4758d6729d0299ce9
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/passes.h"
+#include <memory>
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+// Registers the built-in analysis passes under the string keys that
+// PassRegistry::Retreive() looks them up by.
+PassRegistry::PassRegistry() {
+  passes_.emplace("ir_analysis_pass",
+                  std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
+  passes_.emplace("ir_graph_build_pass",
+                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
+  passes_.emplace("ir_analysis_compose_pass",
+                  std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/passes/passes.h
similarity index 61%
rename from paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
rename to paddle/fluid/inference/analysis/passes/passes.h
index 367c25805d05f8d10fb8341158760ac6356a5c48..ea07e0dcbd992c9d10c6662909798ef79a01e3a7 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.h
@@ -12,24 +12,42 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
+#pragma once
 
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST(FluidToIrPass, Test) {
-  FluidToIrPass pass;
-  Argument argument(FLAGS_inference_model_dir);
-  argument.Set(kFluidToIrPassesAttr,
-               new std::vector<std::string>({"infer_clean_graph_pass"}));
-  pass.Initialize(&argument);
-  pass.Run(argument.main_dfg.get());
-}
+/*
+ * Registry that owns the analysis passes and hands them out by type name.
+ */
+struct PassRegistry {
+  // Defined in passes.cc, where the built-in passes are registered.
+  PassRegistry();
+
+  // Returns the pass registered as `pass_type`, or nullptr if there is none.
+  // The registry keeps ownership of the returned pass.
+  AnalysisPass* Retreive(const std::string& pass_type) {
+    // find() instead of operator[]: a lookup must not insert an empty entry
+    // for an unknown name.
+    auto it = passes_.find(pass_type);
+    return it == passes_.end() ? nullptr : it->second.get();
+  }
+
+  // Shared instance, created on first use and never deleted.
+  static PassRegistry& Global() {
+    static auto* x = new PassRegistry;
+    return *x;
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<AnalysisPass>> passes_;
+};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h
deleted file mode 100644
index 76e4fda0249e03c617d1b37c079dcd97f21387c1..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file defines the the class to partition a graph.
- */
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/node.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
- * modify the graph.
- */
-class SubGraphSplitter {
- public:
-  static const char *kMarkerAttrName;
-  // Tell whether a node is inside a sub-graph.
-  using NodeInsideSubgraphTeller = std::function<bool(const Node *)>;
-
-  SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
-
-  std::vector<std::vector<Node *>> operator()();
-
- protected:
-  // Mark the nodes inside the accepted sub-graph using
-  // node_inside_subgraph_teller.
-  void MarkNodesInsideSubGraph();
-
-  // Merge the marked nodes into sub-graphs and return the sub-graphs.
-  std::vector<std::vector<Node *>> ExtractSubGraphs();
-
- private:
-  DataFlowGraph *graph_;
-  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
-};
-
-/*
- * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To
- * some extent, the TensorRT engine is just a fusion op for a model.
- */
-class SubGraphFuse {
- public:
-  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
-
-  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
-               Argument *argument)
-      : graph_(graph),
-        node_inside_subgraph_teller_(teller),
-        argument_(argument) {}
-
-  // The main method which run all the logic.
-  void operator()();
-
- protected:
-  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
-  void ReplaceNodesWithSubGraphs();
-
- private:
-  DataFlowGraph *graph_;
-  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
-  Argument *argument_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
deleted file mode 100644
index e1dc89fab5fb76d456b07c316ab1cabe6de23b26..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
-  if (node->type() != Node::Type::kFunction) return false;
-  const auto* func = static_cast<const Function*>(node);
-  if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
-      func->func_type() == "conv2d" || func->func_type() == "mul" ||
-      func->func_type() == "sigmoid" || func->func_type() == "softmax") {
-    LOG(INFO) << "sub-graph marked " << node->repr();
-    return true;
-  }
-  return false;
-};
-
-TEST(SubGraphSplitter, Split) {
-  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  auto dfg = ProgramDescToDFG(desc);
-  LOG(INFO) << "spliter\n" << dfg.DotString();
-
-  ASSERT_GT(dfg.nodes.size(), 5UL);
-
-  auto subgraphs = SubGraphSplitter(&dfg, teller)();
-
-  // Check the number of the marked nodes.
-  int marked_nodes = 0;
-  for (auto& node : dfg.nodes.nodes()) {
-    if (node->IsFunction() &&
-        node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) {
-      ++marked_nodes;
-    }
-  }
-  EXPECT_EQ(marked_nodes, 6);
-
-  // For human debug.
-  for (auto& subgraph : subgraphs) {
-    LOG(INFO) << "subgraph size " << subgraph.size();
-    for (auto* node : subgraph) {
-      LOG(INFO) << "node " << node->repr();
-    }
-  }
-
-  ASSERT_EQ(subgraphs.size(), 1UL);
-  // The last sub-graph has 5 Functions.
-  ASSERT_EQ(subgraphs.back().size(), 6UL);
-}
-
-TEST(SubGraphSplitter, Fuse) {
-  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  auto dfg = ProgramDescToDFG(desc);
-  Argument argument;
-  argument.Set<int>("minimum_subgraph_size", new int(3));
-
-  size_t count0 = dfg.nodes.size();
-
-  SubGraphFuse fuse(&dfg, teller, &argument);
-  fuse();
-
-  int count1 = 0;
-  for (auto& node : dfg.nodes.nodes()) {
-    if (node->deleted()) {
-      LOG(INFO) << "deleted " << node->repr();
-    }
-    count1 += node->deleted();
-  }
-
-  // At least one nodes should be deleted.
-  ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
-  ASSERT_EQ(11, count1);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
deleted file mode 100644
index 174c8513f92cf869419f04cab5a54af65e9673b8..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/analysis/node_attr_flags.h"
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
-  for (auto &node : graph->nodes.nodes()) {
-    node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
-  }
-}
-
-class DfgDebuggerPass : public DFG_GraphvizDrawPass {
- public:
-  explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
-      : DFG_GraphvizDrawPass(config) {}
-
-  std::string repr() const override {
-    return "tensorrt-subgraph-node-mark-debugger";
-  }
-
-  bool Finalize() override { return true; }
-
- protected:
-  std::string Draw(DataFlowGraph *graph) override {
-    Dot dot;
-    // Add nodes
-    for (size_t i = 0; i < graph->nodes.size(); i++) {
-      const Node &node = graph->nodes.Get(i);
-      if (config_.display_deleted_node || !node.deleted()) {
-        auto dot_attr = node.dot_attrs();
-        if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
-          dot_attr.assign(
-              {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
-        }
-        dot.AddNode(node.repr(), dot_attr);
-      }
-    }
-    // Add edges
-    for (size_t i = 0; i < graph->nodes.size(); i++) {
-      const Node &node = graph->nodes.Get(i);
-      if (!config_.display_deleted_node && node.deleted()) continue;
-      for (auto &in : node.inlinks) {
-        if (!config_.display_deleted_node && in->deleted()) continue;
-        dot.AddEdge(in->repr(), node.repr(), {});
-      }
-    }
-    return dot.Build();
-  }
-};
-
-AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
-  DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
-                                      "tensorrt_marked_node");
-  return new DfgDebuggerPass(config);
-}
-bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
deleted file mode 100644
index c881a54c240538b68abdcb9060db69de3bf2b8bb..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
- * that supported by TensorRT engine.
- */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Mark the operators that TensorRT engine supports.
- */
-class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
- public:
-  using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
-
-  explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller)
-      : teller_(teller) {}
-
-  bool Initialize(Argument* argument) override { return true; }
-
-  // This class get a sub-graph as input and determine whether to transform this
-  // sub-graph into TensorRT.
-  void Run(DataFlowGraph* graph) override;
-
-  std::string repr() const override { return "tensorrt-sub-subgraph-mark"; }
-  std::string description() const override {
-    return "tensorrt sub-graph mark pass";
-  }
-
-  AnalysisPass* CreateGraphvizDebugerPass() const override;
-  bool Finalize() override;
-
- private:
-  teller_t teller_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
deleted file mode 100644
index c1d932878e559180af987594535959afdf475587..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/node_attr_flags.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TEST(TensorRTSubgraphNodeMarkPass, test) {
-  // init
-  FluidToDataFlowGraphPass pass;
-  Argument argument(FLAGS_inference_model_dir);
-  ASSERT_TRUE(pass.Initialize(&argument));
-  pass.Run(argument.main_dfg.get());
-
-  TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
-    return node->IsFunction() &&
-           static_cast<const Function*>(node)->func_type() == "mul";
-  };
-  TensorRTSubgraphNodeMarkPass pass1(teller);
-  ASSERT_TRUE(pass1.Initialize(&argument));
-  pass1.Run(argument.main_dfg.get());
-
-  int counter{0};
-  for (auto& node : argument.main_dfg->nodes.nodes()) {
-    counter += node->attr(ATTR_supported_by_tensorrt).Bool();
-  }
-  ASSERT_EQ(counter, 2);
-  LOG(INFO) << counter << " nodes marked";
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
deleted file mode 100644
index cc1746ecb34c983d219693bcec17c8789c38fa9f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
-#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-TensorRTSubGraphPass::TensorRTSubGraphPass(
-    const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller)
-    : node_inside_subgraph_teller_(teller) {}
-
-void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
-  VLOG(4) << "debug info "
-          << graph->HumanReadableInfo(false /*show_values*/,
-                                      true /*show_functions*/);
-}
-
-}  // namespace analysis
-}  // namespace inference
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
deleted file mode 100644
index 3545da9109d79964f36c3d7e738620cc2e0f9a6c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * Parse the graph and replace TensorRT supported nodes with SubGraphNode
- */
-class TensorRTSubGraphPass : public DataFlowGraphPass {
- public:
-  // Tell whether to transform a sub-graph into TensorRT.
-  using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
-
-  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
-
-  bool Initialize(Argument* argument) override {
-    argument_ = argument;
-    return true;
-  }
-
-  // This class get a sub-graph as input and determine whether to transform this
-  // sub-graph into TensorRT.
-  void Run(DataFlowGraph* graph) override;
-
-  bool Finalize() override { return true; }
-
-  std::string repr() const override { return "tensorrt-sub-graph"; }
-  std::string description() const override { return "tensorrt sub graph pass"; }
-
- private:
-  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
-  Argument* argument_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
deleted file mode 100644
index 9748e24b06295a4e7c2995429e6588cd0f225fe6..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-DEFINE_string(dot_dir, "./", "");
-
-TEST(TensorRTSubGraphPass, main) {
-  std::unordered_set<std::string> teller_set(
-      {"elementwise_add", "mul", "sigmoid"});
-  SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
-    if (node->type() != Node::Type::kFunction) return false;
-    const auto* func = static_cast<const Function*>(node);
-    if (teller_set.count(func->func_type())) return true;
-    return false;
-  };
-
-  Argument argument(FLAGS_inference_model_dir);
-  argument.Set<int>("minimum_subgraph_size", new int(0));
-  argument.Set<int>("max_batch_size", new int(3));
-  argument.Set<int>("workspace_size", new int(1 << 20));
-  argument.Set<std::string>("precision_mode", new std::string("FP32"));
-
-  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
-  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
-
-  DFG_GraphvizDrawPass dfg_pass(config);
-  DFG_GraphvizDrawPass dfg_pass1(config1);
-  FluidToDataFlowGraphPass pass0;
-  TensorRTSubGraphPass trt_pass(std::move(teller));
-
-  dfg_pass.Initialize(&argument);
-  dfg_pass1.Initialize(&argument);
-  pass0.Initialize(&argument);
-  trt_pass.Initialize(&argument);
-
-  argument.main_dfg.reset(new DataFlowGraph);
-  pass0.Run(argument.main_dfg.get());
-  dfg_pass.Run(argument.main_dfg.get());
-  trt_pass.Run(argument.main_dfg.get());
-  dfg_pass1.Run(argument.main_dfg.get());
-
-  // Check the TRT op's block desc
-  for (auto& node : argument.main_dfg->nodes.nodes()) {
-    if (node->IsFunctionBlock()) {
-      LOG(INFO) << "get function block";
-    }
-  }
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index 1073a6f686eaeeaaae2d93ab044149b7df518085..d599099a8050eaeabb8e0544b1bfe3b6b46b17ec 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -18,8 +18,6 @@ limitations under the License. */
 #include <fstream>
 #include <string>
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
@@ -32,29 +30,6 @@ namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
-static DataFlowGraph ProgramDescToDFG(
-    const framework::proto::ProgramDesc& desc) {
-  DataFlowGraph graph;
-  FluidToDataFlowGraphPass pass;
-  Argument argument;
-  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
-  argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
-  pass.Initialize(&argument);
-  pass.Run(&graph);
-  pass.Finalize();
-  return graph;
-}
-
-class DFG_Tester : public ::testing::Test {
- protected:
-  void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-    argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
-  }
-
-  Argument argument;
-};
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 49a9ebe3ddec1e4fd59ae1155a706859e249d25c..82f74a269a5915dfa1d97a28f5ae15a12ea0b154 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -17,17 +17,22 @@ if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
+
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
+    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()
 
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
+cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
+cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
+
+
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
         DEPS paddle_inference_api)
@@ -37,23 +42,13 @@ if(WITH_TESTING)
                       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
   set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
-        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
-
-if(WITH_GPU AND TENSORRT_FOUND)
-cc_library(paddle_inference_tensorrt_subgraph_engine
-        SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
-  if(WITH_TESTING)
-    inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
-                      ARGS --dirname=${WORD2VEC_MODEL_DIR})
-  endif()
-endif()
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
+        ARGS --dirname=${WORD2VEC_MODEL_DIR})
 
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     # compile the libinference_anakin_api.a and anakin.so.
-    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
-    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy)
     function(anakin_target target_name)
       target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     endfunction()
diff --git a/paddle/fluid/inference/api/README.md b/paddle/fluid/inference/api/README.md
index 20969fac6c8f894ffb4a02b48f795e2a0dcbd096..a2d685d723bd9ab2b84969adb86e177a8754328d 100644
--- a/paddle/fluid/inference/api/README.md
+++ b/paddle/fluid/inference/api/README.md
@@ -2,25 +2,15 @@
 
 Paddle inference offers the APIs in `C` and `C++` languages.
 
-One can easily deploy a model trained by Paddle following the steps as below:
+You can easily deploy a model trained by Paddle by following the steps below:
 
 1. Optimize the native model;
 2. Write some codes for deployment.
 
+## The APIs
 
-Let's explain the steps in detail.
-
-## Optimize the native Fluid Model
-
-The native model that get from the training phase needs to be optimized for that.
-
-- Clean the noise such as the cost operators that do not need inference;
-- Prune unnecessary computation fork that has nothing to do with the output;
-- Remove extraneous variables;
-- Memory reuse for native Fluid executor;
-- Translate the model storage format to some third-party engine's, so that the inference API can utilize the engine for acceleration;
-
-We have an official tool to do the optimization, call `paddle_inference_optimize --help` for more information.
+All the released APIs are located in the `paddle_inference_api.h` header file.
+The stable APIs are wrapped in `namespace paddle`; the unstable APIs are in `namespace paddle::contrib`.
 
 ## Write some codes
 
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ccd2dc5ab353b1634b651a4b7caa2af0da75ce4
--- /dev/null
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle_pass_builder.h"  // NOLINT
+
+namespace paddle {
+
+PassStrategy *contrib::AnalysisConfig::pass_builder() const {
+  PADDLE_ENFORCE(
+      pass_builder_.get(),
+      "Should call constructor first, that will init the pass_builder_.");
+  return pass_builder_.get();
+}
+
+contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) {
+  this->use_gpu = use_gpu;
+  if (use_gpu) {
+    pass_builder_.reset(new GpuPassStrategy);
+  } else {
+    pass_builder_.reset(new CpuPassStrategy);
+  }
+}
+
+contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
+  // Copies all fields; the pass strategy is cloned (type chosen by use_gpu).
+  // fields from Config and NativeConfig
+  model_dir = other.model_dir;
+  use_gpu = other.use_gpu;
+  device = other.device;
+  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
+  prog_file = other.prog_file;
+  param_file = other.param_file;
+  specify_input_name = other.specify_input_name;
+  // fields from this.
+  enable_ir_optim = other.enable_ir_optim;
+  use_feed_fetch_ops = other.use_feed_fetch_ops;
+  use_tensorrt_ = other.use_tensorrt_;
+  use_mkldnn_ = other.use_mkldnn_;
+  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
+  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  if (use_gpu) {
+    pass_builder_.reset(new GpuPassStrategy(
+        *static_cast<GpuPassStrategy *>(other.pass_builder())));
+  } else {
+    pass_builder_.reset(new CpuPassStrategy(
+        *static_cast<CpuPassStrategy *>(other.pass_builder())));
+  }
+}
+
+contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
+  // fields from Config and NativeConfig (strings are moved, not copied)
+  model_dir = std::move(other.model_dir);
+  use_gpu = other.use_gpu;
+  device = other.device;
+  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
+  prog_file = std::move(other.prog_file);
+  param_file = std::move(other.param_file);
+  specify_input_name = other.specify_input_name;
+  // fields from this.
+  enable_ir_optim = other.enable_ir_optim;
+  use_feed_fetch_ops = other.use_feed_fetch_ops;
+  use_tensorrt_ = other.use_tensorrt_;
+  use_mkldnn_ = other.use_mkldnn_;
+  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
+  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  pass_builder_ = std::move(other.pass_builder_);
+}
+
+void contrib::AnalysisConfig::EnableMKLDNN() {
+#ifdef PADDLE_WITH_MKLDNN
+  pass_builder()->EnableMKLDNN();
+  use_mkldnn_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+  use_mkldnn_ = false;
+#endif
+}
+
+void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
+                                                   int max_batch_size) {
+  use_tensorrt_ = true;
+  tensorrt_workspace_size_ = workspace_size;
+  tensorrt_max_batchsize_ = max_batch_size;
+  // Append after the infer_clean pass.
+  pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 54c37fe64590aa82d7100c93c4c5c4ee36491421..7407a1ba2f63bfe31a9d3a6f33395575c5809dee 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -13,10 +13,13 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#include <glog/logging.h>
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
@@ -24,6 +27,9 @@
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#endif
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -35,10 +41,21 @@ namespace paddle {
 
 using contrib::AnalysisConfig;
 
+namespace {
+// True for a persistable variable that is neither FEED_MINIBATCH nor FETCH_LIST.
+bool IsPersistable(const framework::VarDesc *var) {
+  if (!var->Persistable()) return false;
+  const auto type = var->GetType();
+  const bool is_feed = type == framework::proto::VarType::FEED_MINIBATCH;
+  const bool is_fetch = type == framework::proto::VarType::FETCH_LIST;
+  return !is_feed && !is_fetch;
+}
+}  // namespace
+
 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope> &parent_scope,
     const std::shared_ptr<framework::ProgramDesc> &program) {
-  VLOG(3) << "Predictor::init()";
+  VLOG(30) << "Predictor::init()";
 #if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
@@ -52,36 +69,93 @@ bool AnalysisPredictor::Init(
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
 
-  if (config_.use_gpu) {
-    place_ = paddle::platform::CUDAPlace(config_.device);
-    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
-                    "is turned false.";
-    config_.enable_ir_optim = false;
-  } else {
-    place_ = paddle::platform::CPUPlace();
+  if (!PrepareScope(parent_scope)) {
+    return false;
+  }
+  if (!CreateExecutor()) {
+    return false;
+  }
+  if (!PrepareProgram(program)) {
+    return false;
+  }
+
+  // Prepare executor, create local variables; a failure aborts Init.
+  if (!PrepareExecutor()) {
+    return false;
   }
+
+  // Get the feed_target_names and fetch_target_names
+  PrepareFeedFetch();
+
+  return true;
+}
+
+bool AnalysisPredictor::PrepareScope(
+    const std::shared_ptr<framework::Scope> &parent_scope) {
   if (parent_scope) {
+    PADDLE_ENFORCE_NOT_NULL(
+        parent_scope,
+        "Both program and parent_scope should be set in Clone mode.");
     scope_ = parent_scope;
-    sub_scope_ = &(parent_scope->NewScope());
+    status_is_cloned_ = true;
   } else {
     paddle::framework::InitDevices(false);
     scope_.reset(new paddle::framework::Scope());
+    status_is_cloned_ = false;
   }
-
-  executor_.reset(new paddle::framework::NaiveExecutor(place_));
-
+  sub_scope_ = &scope_->NewScope();
+  return true;
+}
+bool AnalysisPredictor::PrepareProgram(
+    const std::shared_ptr<framework::ProgramDesc> &program) {
   if (!program) {
     if (!LoadProgramDesc()) return false;
-    OptimizeInferenceProgram();
+
+    // Optimize the program, and load parameters and modify them in the
+    // scope_.
+    // This will change the scope_ address.
+    if (config_.enable_ir_optim) {
+      status_ir_optim_enabled_ = true;
+      OptimizeInferenceProgram();
+    } else {
+      // If the parent_scope is passed, we assert that the persistable variables
+      // are already created, so just create the no persistable variables.
+
+      // If not cloned, the parameters should be loaded
+      // OptimizeInferenceProgram.
+      // So in both cases, just the local variables are needed to load, not the
+      // parameters.
+      executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
+
+      // Load parameters
+      LOG(INFO) << "load parameters ";
+      LoadParameters();
+    }
   } else {
+    // If the program is passed from external, no need to optimize it, this
+    // logic is used in the clone scenario.
     inference_program_ = program;
   }
 
-  executor_->Prepare(scope_.get(), *inference_program_, 0,
+  executor_->CreateVariables(*inference_program_, 0, false, sub_scope_);
+
+  return true;
+}
+// Picks the place requested by the config and builds a NaiveExecutor on it.
+bool AnalysisPredictor::CreateExecutor() {
+  place_ = paddle::platform::CPUPlace();
+  if (config_.use_gpu) {
+    place_ = paddle::platform::CUDAPlace(config_.device);
+    status_use_gpu_ = true;
+  }
+  executor_.reset(new paddle::framework::NaiveExecutor(place_));
+  return true;
+}
+bool AnalysisPredictor::PrepareExecutor() {
+  executor_->Prepare(sub_scope_, *inference_program_, 0,
                      config_.use_feed_fetch_ops);
 
-  // Get the feed_target_names and fetch_target_names
-  PrepareFeedFetch();
+  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
 
   return true;
 }
@@ -89,7 +163,7 @@ bool AnalysisPredictor::Init(
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
-  VLOG(3) << "Predictor::predict";
+  VLOG(30) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
   // set feed variable
@@ -109,7 +183,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to get fetches";
     return false;
   }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";
 
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
@@ -119,7 +193,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
 
 bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                 framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
+  VLOG(30) << "Predictor::set_feed";
   if (inputs.size() != feeds_.size()) {
     LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
                << inputs.size();
@@ -184,7 +258,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
 
 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                  framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
+  VLOG(30) << "Predictor::get_fetch";
   outputs->resize(fetchs_.size());
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
@@ -206,61 +280,47 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   return true;
 }
 
+// NOTE All the members in AnalysisConfig should be copied to Argument.
 void AnalysisPredictor::OptimizeInferenceProgram() {
-  LOG(INFO) << "optimize begin";
-  FLAGS_IA_enable_ir = config_.enable_ir_optim;
-  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
-  FLAGS_IA_output_storage_path = "";  // Don't output the model.
+  status_program_optimized_ = true;
+
+  argument_.SetUseGPU(config_.use_gpu);
   // Analyze inference_program
   if (!config_.model_dir.empty()) {
-    argument_.fluid_model_dir.reset(new std::string(config_.model_dir));
+    argument_.SetModelDir(config_.model_dir);
   } else {
     PADDLE_ENFORCE(
         !config_.param_file.empty(),
         "Either model_dir or (param_file, prog_file) should be set.");
     PADDLE_ENFORCE(!config_.prog_file.empty());
-    argument_.fluid_model_program_path.reset(
-        new std::string(config_.prog_file));
-    argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
+    argument_.SetModelProgramPath(config_.prog_file);
+    argument_.SetModelParamsPath(config_.param_file);
   }
 
-  argument_.origin_program_desc.reset(
-      new ProgramDesc(*inference_program_->Proto()));
-
-  switch (config_.ir_mode) {
-    case contrib::AnalysisConfig::IrPassMode::kExclude:
-      Analyzer()
-          .IncludeAllIrPasses()
-          .SetUseMkldnn(config_._use_mkldnn)
-          .DisableIrPasses(config_.ir_passes)
-          .Run(&argument_);
-      break;
-    case contrib::AnalysisConfig::IrPassMode::kInclude:
-      Analyzer()
-          .SetUseMkldnn(config_._use_mkldnn)
-          .IncludeIrPasses(config_.ir_passes)
-          .Run(&argument_);
-      break;
-    default:
-      LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet.";
+  if (config_.use_gpu && config_.use_tensorrt_) {
+    argument_.SetUseTensorRT(true);
+    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
+    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
   }
 
-  CHECK(argument_.transformed_program_desc);
+  auto passes = config_.pass_builder()->AllPasses();
+  if (!config_.enable_ir_optim) passes.clear();
+  argument_.SetIrAnalysisPasses(passes);
+  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
+  Analyzer().Run(&argument_);
+
+  PADDLE_ENFORCE(argument_.scope_valid());
   VLOG(5) << "to prepare executor";
+  ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
   inference_program_.reset(
-      new framework::ProgramDesc(*argument_.transformed_program_desc));
-  if (argument_.Has(framework::ir::kParamScopeAttr)) {
-    // Update scope.
-    scope_.reset(
-        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
-  }
+      new framework::ProgramDesc(argument_.ir_analyzed_program()));
   LOG(INFO) << "== optimize end ==";
 }
 
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
-  VLOG(3) << "create AnalysisConfig";
+  VLOG(30) << "create AnalysisConfig";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
@@ -274,7 +334,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
       std::string flag = "--fraction_of_gpu_memory_to_use=" +
                          std::to_string(config.fraction_of_gpu_memory);
       flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
+      VLOG(30) << "set flag: " << flag;
       framework::InitGflags(flags);
     }
   }
@@ -283,10 +343,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
-  return predictor;
+  return std::move(predictor);
 }
 
 void AnalysisPredictor::PrepareFeedFetch() {
+  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
+  CreateFeedFetchVar(sub_scope_);
   for (auto *op : inference_program_->Block(0).AllOps()) {
     if (op->Type() == "feed") {
       int idx = boost::get<int>(op->GetAttr("col"));
@@ -305,6 +367,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
   }
 }
 
+// Creates the "feed" and "fetch" variables in `scope`, each holding a
+// FeedFetchList.
+void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
+  PADDLE_ENFORCE_NOT_NULL(scope);
+  scope->Var("feed")->GetMutable<framework::FeedFetchList>();
+  scope->Var("fetch")->GetMutable<framework::FeedFetchList>();
+}
+
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
     const std::string &name) {
   PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
@@ -335,27 +405,98 @@ bool AnalysisPredictor::ZeroCopyRun() {
 
 bool AnalysisPredictor::LoadProgramDesc() {
   // Initialize the inference program
-  std::unique_ptr<framework::Executor> tmp_exe(
-      new framework::Executor(platform::CPUPlace()));
+  std::string filename;
   if (!config_.model_dir.empty()) {
-    // Parameters are saved in separate files sited in
-    // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(
-        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
-        config_.model_dir);
+    filename = config_.model_dir + "/__model__";
   } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
     // All parameters are saved in a single file.
     // The file names should be consistent with that used
     // in Python API `fluid.io.save_inference_model`.
-    inference_program_ = paddle::inference::Load(
-        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
-        config_.prog_file, config_.param_file);
+    filename = config_.prog_file;
   } else {
+    if (config_.model_dir.empty() && config_.prog_file.empty()) {
+      LOG(ERROR)
+          << "Either model_dir or (prog_file, param_file) should be set.";
+      return false;
+    }
     LOG(ERROR) << string::Sprintf(
         "not valid model path '%s' or program path '%s'.", config_.model_dir,
         config_.param_file);
     return false;
   }
+
+  std::string pb_content;
+  // Read binary
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
+  pb_content.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(pb_content.at(0)), pb_content.size());
+  fin.close();
+
+  // Create ProgramDesc; a parse failure means the file is not a valid model.
+  framework::proto::ProgramDesc proto;
+  PADDLE_ENFORCE(proto.ParseFromString(pb_content), "Cannot parse file %s",
+                 filename);
+  inference_program_.reset(new framework::ProgramDesc(proto));
+  return true;
+}
+
+bool AnalysisPredictor::LoadParameters() {
+  PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
+                          "The inference program should be loaded first.");
+  const auto &global_block = inference_program_->MutableBlock(0);
+
+  // create a temporary program to load parameters.
+
+  std::unique_ptr<framework::ProgramDesc> load_program(
+      new framework::ProgramDesc());
+  framework::BlockDesc *load_block = load_program->MutableBlock(0);
+  std::vector<std::string> params;
+
+  for (auto *var : global_block->AllVars()) {
+    if (IsPersistable(var)) {
+      VLOG(3) << "persistable variable's name: " << var->Name();
+
+      framework::VarDesc *new_var = load_block->Var(var->Name());
+      new_var->SetShape(var->GetShape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      if (!config_.param_file.empty()) {
+        params.push_back(new_var->Name());
+      } else {
+        // append_op
+        framework::OpDesc *op = load_block->AppendOp();
+        op->SetType("load");
+        op->SetOutput("Out", {new_var->Name()});
+        op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()});
+        op->CheckAttrs();
+      }
+    }
+  }
+
+  if (!config_.param_file.empty()) {
+    // sort paramlist to have consistent ordering
+    std::sort(params.begin(), params.end());
+    // append just the load_combine op
+    framework::OpDesc *op = load_block->AppendOp();
+    op->SetType("load_combine");
+    op->SetOutput("Out", params);
+    op->SetAttr("file_path", {config_.param_file});
+    op->CheckAttrs();
+  }
+
+  // Use NaiveExecutor to Load parameters.
+  platform::CPUPlace place;
+  framework::NaiveExecutor e(place);
+  e.Prepare(scope_.get(), *load_program, 0, false);
+  e.Run();
+  VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";
+
   return true;
 }
 
@@ -385,3 +526,26 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
 }
 
 }  // namespace paddle
+
+#if PADDLE_WITH_TENSORRT
+USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(elementwise_add_tensor);
+USE_TRT_CONVERTER(elementwise_sub_tensor);
+USE_TRT_CONVERTER(elementwise_div_tensor);
+USE_TRT_CONVERTER(elementwise_mul_tensor);
+USE_TRT_CONVERTER(elementwise_max_tensor);
+USE_TRT_CONVERTER(elementwise_min_tensor);
+USE_TRT_CONVERTER(elementwise_pow_tensor);
+USE_TRT_CONVERTER(mul);
+USE_TRT_CONVERTER(conv2d);
+USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(sigmoid);
+USE_TRT_CONVERTER(tanh);
+USE_TRT_CONVERTER(fc);
+USE_TRT_CONVERTER(pool2d);
+USE_TRT_CONVERTER(softmax);
+USE_TRT_CONVERTER(batch_norm);
+USE_TRT_CONVERTER(concat);
+USE_TRT_CONVERTER(dropout);
+USE_TRT_CONVERTER(pad);
+#endif
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index b7dc2067332278c1c38df4beefb5059efe76417f..cf81b7db738d899566ddf32c5e5a40475c8e7bc7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #pragma once
+#include <algorithm>
+#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
@@ -21,7 +23,10 @@
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
-
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest.h>
+#include <gtest/gtest_prod.h>
+#endif
 namespace paddle {
 
 using inference::analysis::Argument;
@@ -52,6 +57,7 @@ class AnalysisPredictor : public PaddlePredictor {
 
   bool ZeroCopyRun() override;
 
+  void CreateFeedFetchVar(framework::Scope *scope);
   void PrepareFeedFetch();
 
   void OptimizeInferenceProgram();
@@ -60,11 +66,17 @@ class AnalysisPredictor : public PaddlePredictor {
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  framework::Scope *scope() { return executor_->scope(); }
+  framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
  protected:
+  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
+  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
+  bool CreateExecutor();
+  bool PrepareExecutor();
+
   bool LoadProgramDesc();
+  bool LoadParameters();
 
   bool SetFeed(const std::vector<PaddleTensor> &input_datas,
                framework::Scope *scope);
@@ -75,6 +87,14 @@ class AnalysisPredictor : public PaddlePredictor {
                    PaddleTensor *output_data);
   ~AnalysisPredictor();
 
+// Some more detailed tests are made friends of the predictor, so that all the
+// internal details can be tested.
+#ifdef PADDLE_WITH_TESTING
+  FRIEND_TEST(AnalysisPredictor, analysis_off);
+  FRIEND_TEST(AnalysisPredictor, analysis_on);
+  FRIEND_TEST(AnalysisPredictor, with_gpu);
+#endif
+
  private:
   contrib::AnalysisConfig config_;
   Argument argument_;
@@ -90,6 +110,13 @@ class AnalysisPredictor : public PaddlePredictor {
   // concurrency problems, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
   details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
+
+ private:
+  // Some status here that help to determine the status inside the predictor.
+  bool status_program_optimized_{false};
+  bool status_is_cloned_{false};
+  bool status_use_gpu_{false};
+  bool status_ir_optim_enabled_{false};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 13c25da1b52742e6114b294847c21ce735b9fc21..1e6f75e364cbe66d141cf2336f065d50928d1bc2 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -12,19 +12,88 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <thread>
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");
 
 namespace paddle {
-namespace inference {
 using contrib::AnalysisConfig;
 
+TEST(AnalysisPredictor, analysis_off) {
+  AnalysisConfig config(false);
+  config.model_dir = FLAGS_dirname;
+  config.enable_ir_optim = false;
+
+  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
+
+  // Without analysis, the scope_ and sub_scope_ are created by predictor
+  // itself.
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // ir is turned off, so program shouldn't be optimized.
+  ASSERT_FALSE(predictor->status_program_optimized_);
+  LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size();
+
+  // 2. Dummy Input Data
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(inputs, &outputs));
+}
+
+TEST(AnalysisPredictor, analysis_on) {
+  AnalysisConfig config(false);
+  config.model_dir = FLAGS_dirname;
+  config.enable_ir_optim = true;
+
+  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // ir is turned on, so program should be optimized.
+  ASSERT_TRUE(predictor->status_program_optimized_);
+  // 2. Dummy Input Data
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(inputs, &outputs));
+
+  for (auto& output : outputs) {
+    LOG(INFO) << inference::DescribeTensor(output);
+  }
+
+  // compare with NativePredictor
+  auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<PaddleTensor> naive_outputs;
+  ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
+  ASSERT_EQ(naive_outputs.size(), 1UL);
+  inference::CompareTensor(outputs.front(), naive_outputs.front());
+}
+
 TEST(AnalysisPredictor, ZeroCopy) {
   AnalysisConfig config;
-  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
   config.use_feed_fetch_ops = false;
 
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
@@ -61,5 +130,59 @@ TEST(AnalysisPredictor, ZeroCopy) {
   LOG(INFO) << "output_data: " << out_data;
 }
 
-}  // namespace inference
+TEST(AnalysisPredictor, Clone) {
+  AnalysisConfig config;
+  config.model_dir = FLAGS_dirname;
+  config.use_feed_fetch_ops = true;
+  config.enable_ir_optim = true;
+
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor(config));
+
+  LOG(INFO) << "************** to clone ************************";
+  const int num_threads = 3;
+  for (int i = 1; i < num_threads; i++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
+  auto* root_scope =
+      static_cast<AnalysisPredictor*>(predictors[0].get())->scope();
+  ASSERT_FALSE(root_scope->kids().empty());
+  LOG(INFO) << "***** scope ******\n"
+            << framework::GenScopeTreeDebugInfo(root_scope);
+
+  // 2. Dummy Input Data
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> outputs;
+  predictors[0]->Run(inputs, &outputs);
+
+  LOG(INFO) << "Run with single thread";
+  for (int i = 0; i < num_threads; i++) {
+    LOG(INFO) << "run predictor " << i;
+    ASSERT_TRUE(predictors[i]->Run(inputs, &outputs));
+  }
+
+  LOG(INFO) << "Run with multiple threads";
+  std::vector<std::thread> threads;
+  for (int i = 0; i < num_threads; i++) {
+    threads.emplace_back([&predictors, &inputs, i] {
+      LOG(INFO) << "thread #" << i << " running";
+      std::vector<PaddleTensor> outputs;
+      for (int j = 0; j < 10; j++) {
+        ASSERT_TRUE(predictors[i]->Run(inputs, &outputs));
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 01ea942d3c8d20180cfc9664b8601ba87a898e86..9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -15,8 +15,8 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index 04536ea3a53bbbc9293d92e69a23567e4bfd84c0..6a8b81cc57281b12cd3a4c89c863b20a824ce34a 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -19,11 +19,13 @@ limitations under the License. */
 
 #pragma once
 
+#define WITH_ANAKIN
+
 #include <vector>
 
 #include "framework/core/net/net.h"
 #include "framework/graph/graph.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_anakin_config.h"
 #include "saber/core/shape.h"
 #include "saber/saber_types.h"
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index d06ab8f8c8e3c0adf4a4074eb1450012126e83ea..fcbc3803d04def9a9855f2fee489e7e2c561b454 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to get fetches";
     return false;
   }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";
 
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 5152b8670ddb206f0927c03149684af4a096df42..014bdc6a379744463e535df97af4c9c2e1651656 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -292,7 +292,14 @@ TEST(inference_api_native, image_classification_gpu) {
 // TEST(inference_api_native, image_classification_gpu_threads) {
 //   MainThreadsImageClassification(true /*use_gpu*/);
 // }
-
 #endif
 
+TEST(PassBuilder, Delete) {
+  contrib::AnalysisConfig config(false);
+  config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
+  const auto& passes = config.pass_builder()->AllPasses();
+  auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");
+  ASSERT_EQ(it, passes.end());
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
deleted file mode 100644
index 7ac468ee4d33f49bba20a07c976055a083743cbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ /dev/null
@@ -1,188 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/api_impl.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/operators/tensorrt_engine_op.h"
-
-namespace paddle {
-
-using inference::analysis::Argument;
-using inference::Singleton;
-using inference::analysis::Analyzer;
-using framework::proto::ProgramDesc;
-using paddle::contrib::MixedRTConfig;
-
-class TensorRTSubgraphPredictor : public NativePaddlePredictor {
- public:
-  explicit TensorRTSubgraphPredictor(const MixedRTConfig& config)
-      : NativePaddlePredictor(config), config_(config) {}
-
-  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
-    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
-    VLOG(3) << "Predictor::init()";
-    if (config_.use_gpu) {
-      place_ = paddle::platform::CUDAPlace(config_.device);
-    } else {
-      place_ = paddle::platform::CPUPlace();
-    }
-    if (parent_scope) {
-      scope_ = parent_scope;
-      sub_scope_ = &(parent_scope->NewScope());
-    } else {
-      paddle::framework::InitDevices(false);
-      scope_.reset(new paddle::framework::Scope());
-    }
-
-    executor_.reset(new paddle::framework::Executor(place_));
-
-    // Initialize the inference program
-    if (!config_.model_dir.empty()) {
-      // Parameters are saved in separate files sited in
-      // the specified `dirname`.
-      inference_program_ = paddle::inference::Load(
-          executor_.get(), scope_.get(), config_.model_dir);
-    } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
-      // All parameters are saved in a single file.
-      // The file names should be consistent with that used
-      // in Python API `fluid.io.save_inference_model`.
-      inference_program_ = paddle::inference::Load(
-          executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
-    } else {
-      LOG(ERROR) << "fail to load inference model.";
-      return false;
-    }
-
-    OptimizeInferenceProgram();
-    ctx_ = executor_->Prepare(*inference_program_, 0);
-
-    VLOG(5) << "to create variables";
-    executor_->CreateVariables(*inference_program_,
-                               sub_scope_ ? sub_scope_ : scope_.get(), 0);
-    // Get the feed_target_names and fetch_target_names
-    PrepareFeedFetch();
-    return true;
-  }
-
-  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data,
-           int batch_size = -1) override {
-    PADDLE_ENFORCE_GT(batch_size, 0,
-                      "TensorRT engine needs the argument batch_size set");
-    FLAGS_tensorrt_engine_batch_size = batch_size;
-    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
-  }
-
-  void OptimizeInferenceProgram() {
-    // Analyze inference_program
-    Argument argument;
-
-    argument.Set<int>("minimum_subgraph_size",
-                      new int(config_.minimum_subgraph_size));
-    argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
-    argument.Set<int>("workspace_size", new int(config_.workspace_size));
-    argument.Set<std::string>("precision_mode",
-                              new std::string(config_.precision_mode));
-
-    if (!config_.model_dir.empty()) {
-      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
-    } else {
-      PADDLE_ENFORCE(
-          !config_.param_file.empty(),
-          "Either model_dir or (param_file, prog_file) should be set.");
-      PADDLE_ENFORCE(!config_.prog_file.empty());
-      argument.fluid_model_program_path.reset(
-          new std::string(config_.prog_file));
-      argument.fluid_model_param_path.reset(
-          new std::string(config_.param_file));
-    }
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    inference_program_.reset(
-        new framework::ProgramDesc(*argument.transformed_program_desc));
-  }
-
- private:
-  MixedRTConfig config_;
-};
-
-template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
-    const MixedRTConfig& config) {
-  VLOG(3) << "create TensorRTSubgraphPredictor";
-  if (config.use_gpu) {
-    // 1. GPU memeroy
-    PADDLE_ENFORCE_GT(
-        config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
-    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
-    std::vector<std::string> flags;
-    if (config.fraction_of_gpu_memory >= 0.0f ||
-        config.fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummpy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(config.fraction_of_gpu_memory);
-      flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
-    }
-  }
-
-  std::unique_ptr<PaddlePredictor> predictor(
-      new TensorRTSubgraphPredictor(config));
-  if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
-           ->Init(nullptr)) {
-    return nullptr;
-  }
-  return std::move(predictor);
-}
-
-template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<MixedRTConfig>(
-    const MixedRTConfig& config) {
-  return CreatePaddlePredictor<MixedRTConfig,
-                               PaddleEngineKind::kAutoMixedTensorRT>(config);
-}
-
-}  // namespace paddle
-
-USE_TRT_CONVERTER(elementwise_add_weight);
-USE_TRT_CONVERTER(elementwise_add_tensor);
-USE_TRT_CONVERTER(elementwise_sub_tensor);
-USE_TRT_CONVERTER(elementwise_div_tensor);
-USE_TRT_CONVERTER(elementwise_mul_tensor);
-USE_TRT_CONVERTER(elementwise_max_tensor);
-USE_TRT_CONVERTER(elementwise_min_tensor);
-USE_TRT_CONVERTER(elementwise_pow_tensor);
-USE_TRT_CONVERTER(mul);
-USE_TRT_CONVERTER(conv2d);
-USE_TRT_CONVERTER(relu);
-USE_TRT_CONVERTER(sigmoid);
-USE_TRT_CONVERTER(tanh);
-USE_TRT_CONVERTER(fc);
-USE_TRT_CONVERTER(pool2d);
-USE_TRT_CONVERTER(softmax);
-USE_TRT_CONVERTER(batch_norm);
-USE_TRT_CONVERTER(concat);
-USE_TRT_CONVERTER(dropout);
-USE_TRT_CONVERTER(pad);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
deleted file mode 100644
index 89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace paddle {
-
-using contrib::MixedRTConfig;
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-void CompareTensorRTWithFluid(bool enable_tensorrt) {
-  FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt;
-
-  //# 1. Create PaddlePredictor with a config.
-  NativeConfig config0;
-  config0.model_dir = FLAGS_dirname;
-  config0.use_gpu = true;
-  config0.fraction_of_gpu_memory = 0.3;
-  config0.device = 0;
-
-  MixedRTConfig config1;
-  config1.model_dir = FLAGS_dirname;
-  config1.use_gpu = true;
-  config1.fraction_of_gpu_memory = 0.3;
-  config1.device = 0;
-  config1.max_batch_size = 10;
-
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
-
-  for (int batch_id = 0; batch_id < 1; batch_id++) {
-    //# 2. Prepare input.
-    std::vector<int64_t> data(20);
-    for (int i = 0; i < 20; i++) data[i] = i;
-
-    PaddleTensor tensor;
-    tensor.shape = std::vector<int>({10, 1});
-    tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
-    tensor.dtype = PaddleDType::INT64;
-
-    // For simplicity, we set all the slots with the same data.
-    std::vector<PaddleTensor> slots(4, tensor);
-
-    //# 3. Run
-    std::vector<PaddleTensor> outputs0;
-    std::vector<PaddleTensor> outputs1;
-    CHECK(predictor0->Run(slots, &outputs0));
-    CHECK(predictor1->Run(slots, &outputs1, 10));
-
-    //# 4. Get output.
-    ASSERT_EQ(outputs0.size(), 1UL);
-    ASSERT_EQ(outputs1.size(), 1UL);
-
-    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-    EXPECT_EQ(num_elements, num_elements1);
-
-    auto *data0 = static_cast<float *>(outputs0.front().data.data());
-    auto *data1 = static_cast<float *>(outputs1.front().data.data());
-
-    ASSERT_GT(num_elements, 0UL);
-    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-      EXPECT_NEAR(data0[i], data1[i], 1e-3);
-    }
-  }
-}
-
-TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
-  CompareTensorRTWithFluid(false);
-}
-
-TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
-  CompareTensorRTWithFluid(true);
-}
-
-}  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 5446fd4d4256c10442a53ea09a447cf308cbd681..6ae5198dab9a16d5a861c641dee39e4806595352 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <memory>
 #include <thread>  //NOLINT
 
-#include "paddle/include/paddle_inference_api.h"
+#include "utils.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index 4a8404f21c6ec6a1647e964ac3538b4b49151009..72d20bc59e036afb84e2501f6af75c09be78b57e 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -36,16 +36,15 @@ namespace demo {
  */
 void Main() {
   std::unique_ptr<PaddlePredictor> predictor;
-  paddle::contrib::MixedRTConfig config;
+  paddle::contrib::AnalysisConfig config(true);
   config.param_file = FLAGS_modeldir + "/__params__";
   config.prog_file = FLAGS_modeldir + "/__model__";
-  config.use_gpu = true;
   config.device = 0;
-  config.max_batch_size = 1;
+  config.EnableTensorRtEngine();
   config.fraction_of_gpu_memory = 0.1;  // set by yourself
-  predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
+  predictor = CreatePaddlePredictor(config);
 
-  VLOG(3) << "begin to process data";
+  VLOG(30) << "begin to process data";
   // Just a single batch of data.
   std::string line;
   std::ifstream file(FLAGS_data);
@@ -60,13 +59,13 @@ void Main() {
       PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
   input.dtype = PaddleDType::FLOAT32;
 
-  VLOG(3) << "run executor";
+  VLOG(30) << "run executor";
   std::vector<PaddleTensor> output;
   predictor->Run({input}, &output, 1);
 
-  VLOG(3) << "output.size " << output.size();
+  VLOG(30) << "output.size " << output.size();
   auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
+  VLOG(30) << "output: " << SummaryTensor(tensor);
 
   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index d70c6aea791219a40c3164b51499f9d5e562be71..664b9d01c7810aa4f053cd6ebbff5f3f7619fd05 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
 }
 
 Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
   std::vector<std::string> columns;
   split(line, '\t', &columns);
   CHECK_EQ(columns.size(), 2UL)
@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
   for (auto& s : shape_strs) {
     record.shape.push_back(std::stoi(s));
   }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
   return record;
 }
 
@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
   file.close();
 
   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
+  VLOG(30) << "predictor output numel " << numel;
+  VLOG(30) << "reference output numel " << refer.data.size();
   CHECK_EQ(numel, refer.data.size());
   switch (output.dtype) {
     case PaddleDType::INT64: {
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 8d546e3e9c740c10bcf2984e073c956e3612625c..bc8891455dc8e4a30ddfcc5f89792296e59c2548 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -17,7 +17,7 @@ limitations under the License. */
  */
 
 #include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include <glog/logging.h>
 #include "utils.h"  // NOLINT
 
 #ifdef PADDLE_WITH_CUDA
@@ -40,20 +40,17 @@ using contrib::AnalysisConfig;
  */
 void Main(bool use_gpu) {
   std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
-  AnalysisConfig config;
+  AnalysisConfig config(use_gpu);
   config.param_file = FLAGS_modeldir + "/__params__";
   config.prog_file = FLAGS_modeldir + "/__model__";
-  config.use_gpu = use_gpu;
   config.device = 0;
   if (FLAGS_use_gpu) {
     config.fraction_of_gpu_memory = 0.1;  // set by yourself
   }
 
-  VLOG(3) << "init predictor";
   predictor = CreatePaddlePredictor<NativeConfig>(config);
-  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  analysis_predictor = CreatePaddlePredictor(config);
 
-  VLOG(3) << "begin to process data";
   // Just a single batch of data.
   std::string line;
   std::ifstream file(FLAGS_data);
@@ -68,13 +65,10 @@ void Main(bool use_gpu) {
       PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
   input.dtype = PaddleDType::FLOAT32;
 
-  VLOG(3) << "run executor";
   std::vector<PaddleTensor> output, analysis_output;
   predictor->Run({input}, &output, 1);
 
-  VLOG(3) << "output.size " << output.size();
   auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
 
   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
index 4ae6c6dc9f44650c1c62f5be5448864d817513b1..244b0b567b5df6735acd7f1bf3c2056f449be872 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
       // parameter.
       if (var_name == "feed" || var_name == "fetch") continue;
       if (var->Type() == typeid(framework::LoDTensorArray)) {
-        VLOG(4) << "collect " << var_name;
+        VLOG(40) << "collect " << var_name;
         arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
       }
     }
@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
       CollectTensorArrays(kid);
     }
 
-    VLOG(3) << "Collect " << arrays_.size() << " arrays";
+    VLOG(30) << "Collect " << arrays_.size() << " arrays";
     flag_ = false;
   }
 }
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 14698f6dfc8885ec1d35f1912bad10a9caa13db4..0f540699b8ffea94c3f3aaf3354a0462e0e674a9 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -51,7 +51,7 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
 }
 
 template <typename T>
-T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
   auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
   auto *res = tensor->data<T>();
 
@@ -67,8 +67,10 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
   return res;
 }
 
-template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
-template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
+                                            int *size) const;
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
+                                                int *size) const;
 template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
 template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
 
@@ -84,7 +86,7 @@ void *ZeroCopyTensor::FindTensor() const {
   return tensor;
 }
 
-std::vector<int64_t> ZeroCopyTensor::shape() {
+std::vector<int64_t> ZeroCopyTensor::shape() const {
   auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
   PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
   return framework::vectorize(tensor->dims());
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
index 2d5b561d801cd9e734cab13b28e7285493e30f94..12071e09f8442f2c52a06b7c3fe4bed2c28b524a 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
@@ -24,18 +24,20 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
 }
 
 template <typename T>
-T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
   return nullptr;
 }
 
-template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
-template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
+                                            int *size) const;
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
+                                                int *size) const;
 template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
 template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
 
 void *ZeroCopyTensor::FindTensor() const { return nullptr; }
 
-std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
+std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
 
 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
 
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index e46dc1326951f68fd030f2208b9bea1647d0026d..252960d89e067d6b55c8f28d01e90a452cae2e92 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -16,13 +16,14 @@
 
 #include <glog/logging.h>
 #include <sys/time.h>
+#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"
 
 namespace paddle {
 namespace inference {
@@ -124,6 +125,51 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
   return size;
 }
 
+// Returns true when `a` and `b` agree on dtype, LoD, shape and every element;
+// the first mismatch is logged at ERROR level. Elements are compared as the
+// type named by `dtype`, so INT64 payloads are not reinterpreted as floats.
+static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
+  if (a.dtype != b.dtype) {
+    LOG(ERROR) << "dtype not match";
+    return false;
+  }
+
+  // std::vector compares sizes first and then element by element, which
+  // covers both the outer LoD levels and the offsets inside each level.
+  if (a.lod != b.lod) {
+    LOG(ERROR) << "lod not match";
+    return false;
+  }
+
+  if (a.shape != b.shape) {
+    LOG(ERROR) << "shape not match";
+    return false;
+  }
+
+  // Refuse to read past either buffer when it is shorter than the shape says.
+  const size_t numel = std::max(VecReduceToInt(a.shape), 0);
+  const size_t elem_size =
+      a.dtype == PaddleDType::INT64 ? sizeof(int64_t) : sizeof(float);
+  if (a.data.length() < numel * elem_size ||
+      b.data.length() < numel * elem_size) {
+    LOG(ERROR) << "data length not match";
+    return false;
+  }
+
+  bool same = true;
+  if (a.dtype == PaddleDType::INT64) {
+    auto *adata = static_cast<int64_t *>(a.data.data());
+    auto *bdata = static_cast<int64_t *>(b.data.data());
+    same = std::equal(adata, adata + numel, bdata);
+  } else {
+    auto *adata = static_cast<float *>(a.data.data());
+    auto *bdata = static_cast<float *>(b.data.data());
+    same = std::equal(adata, adata + numel, bdata);
+  }
+  if (!same) LOG(ERROR) << "data not match";
+  return same;
+}
+
 static std::string DescribeTensor(const PaddleTensor &tensor) {
   std::stringstream os;
   os << "Tensor [" << tensor.name << "]\n";
@@ -156,6 +202,26 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
   return os.str();
 }
 
+// Renders the name, shape, LoD and data of `tensor` as text; data is read as
+// float. `size` starts at 0 so a data() that never writes it prints nothing.
+static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name() << "]\n";
+  os << " - shape: " << to_string(tensor.shape()) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod()) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n - data: ";
+  PaddlePlace place;
+  int size = 0;
+  const auto *data = tensor.data<float>(&place, &size);
+  // A null buffer (e.g. tensor not backed by real storage) is skipped.
+  for (int i = 0; data != nullptr && i < size; i++) {
+    os << data[i] << " ";
+  }
+  return os.str();
+}
+
 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/api/paddle_anakin_config.h
similarity index 56%
rename from paddle/fluid/inference/analysis/analyzer_main.cc
rename to paddle/fluid/inference/api/paddle_anakin_config.h
index 5e1fe3eb797cdced56a61aa2db0c3d18601824f8..0e91c2624bed4459b936ac4477d73ae954e55bcc 100644
--- a/paddle/fluid/inference/analysis/analyzer_main.cc
+++ b/paddle/fluid/inference/api/paddle_anakin_config.h
@@ -11,23 +11,25 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
 
-/*
- * This file implements analysizer -- an executation help to analyze and
- * optimize trained model.
- */
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
 
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  using paddle::inference::analysis::Analyzer;
-  using paddle::inference::analysis::Argument;
+#include "paddle_api.h"  // NOLINT
 
-  Argument argument;
-  Analyzer analyzer;
-  analyzer.Run(&argument);
+namespace paddle {
+namespace contrib {
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+  TargetType target_type;
+};
 
-  return 0;
-}
+}  // namespace contrib
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..82c04e9f3f043df9db82969e2a5ce8825a3a41f6
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+// These headers are included with relative paths because the absolute path of
+// this header file changes when the library is deployed.
+#include "paddle_api.h"           // NOLINT
+#include "paddle_pass_builder.h"  // NOLINT
+
+namespace paddle {
+
+class AnalysisPredictor;
+// ==
+//
+// -----------------------------------------------------------------------------------
+// NOTE: The following APIs are not mature yet, we are still working on them.
+namespace contrib {
+
+// NOTE WIP, not stable yet. Extends NativeConfig with IR-analysis options.
+struct AnalysisConfig : public NativeConfig {
+  explicit AnalysisConfig(bool use_gpu = false);
+  explicit AnalysisConfig(const AnalysisConfig& other);
+  explicit AnalysisConfig(AnalysisConfig&& other);
+
+  // Determine whether to perform graph optimization.
+  bool enable_ir_optim = true;
+
+  // Get a pass builder to customize the passes in the IR analysis phase.
+  PassStrategy* pass_builder() const;
+
+  // NOT stable yet.
+  bool use_feed_fetch_ops{true};
+
+  // Opt in to TensorRT (defined elsewhere); defaults: 1 << 20 and batch 1.
+  void EnableTensorRtEngine(int workspace_size = 1 << 20,
+                            int max_batch_size = 1);
+  // NOTE internal development only, please do not use it. NOT stable yet.
+  void EnableMKLDNN();
+  bool use_mkldnn() const { return use_mkldnn_; }
+
+  friend class ::paddle::AnalysisPredictor;
+
+ protected:
+  bool use_tensorrt_{false};
+  bool use_mkldnn_{false};
+  int tensorrt_workspace_size_{1 << 20};  // Never left indeterminate.
+  int tensorrt_max_batchsize_{1};         // Never left indeterminate.
+  std::unique_ptr<PassStrategy> pass_builder_;
+};
+
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+  TargetType target_type;
+};
+
+}  // namespace contrib
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a2a2a1a23401b5aa4d3402da6f7a3369280d8f5
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -0,0 +1,220 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+
+// Data type.
+enum PaddleDType {
+  FLOAT32 = 0,
+  INT64 = 1,
+  // TODO(Superjomn) support more data types if needed.
+};
+
+/*
+ * Memory management for PaddleTensor.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
+ *
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ *   specifying the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ *   memory.
+ * ATTENTION, for user allocated memory, deallocation should be done by users
+ * externally after the program finished. The PaddleBuf won't do any allocation
+ * or deallocation.
+ *
+ * To have the PaddleBuf allocate and manage the memory:
+ * - PaddleBuf(size_t length) will allocate a memory of size `length`.
+ * - Resize(size_t length) resizes the memory to no less than `length`.
+ *   ATTENTION: if the allocated memory is already larger than `length`,
+ *   nothing will be done.
+ */
+class PaddleBuf {
+ public:
+  // Allocate `length` bytes with new char[] and mark the buffer as owned.
+  explicit PaddleBuf(size_t length)
+      : data_(new char[length]), length_(length), memory_owned_(true) {}
+  // Wrap external memory; the buffer is marked as not owned by PaddleBuf.
+  PaddleBuf(void* data, size_t length)
+      : data_(data), length_(length), memory_owned_{false} {}
+  // Copy only available when memory is managed externally.
+  explicit PaddleBuf(const PaddleBuf&);
+
+  // Resize the memory.
+  void Resize(size_t length);
+  // Reset to external memory, with address and length set.
+  void Reset(void* data, size_t length);
+  // Tell whether the buffer is empty, i.e. its recorded length is zero.
+  bool empty() const { return length_ == 0; }
+  // Get the memory address.
+  void* data() const { return data_; }
+  // Get the memory length in bytes.
+  size_t length() const { return length_; }
+
+  ~PaddleBuf() { Free(); }
+  PaddleBuf& operator=(const PaddleBuf&);
+  PaddleBuf& operator=(PaddleBuf&&);
+  PaddleBuf() = default;
+  PaddleBuf(PaddleBuf&& other);
+
+ private:
+  void Free();
+  void* data_{nullptr};  // pointer to the data memory.
+  size_t length_{0};     // number of memory bytes.
+  bool memory_owned_{true};
+};
+
+// Basic I/O data structure for PaddlePredictor. NOTE: `dtype` has no default.
+struct PaddleTensor {
+  PaddleTensor() = default;
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
+};
+
+enum class PaddlePlace { kUNK = -1, kCPU = 0, kGPU = 1 };
+// Tensor without copy, currently only supports AnalysisPredictor.
+class ZeroCopyTensor {
+ public:
+  void Reshape(const std::vector<int>& shape);
+
+  // Get the memory in CPU or GPU with specific data type, should Reshape first
+  // to tell the data size.
+  // One can directly write to the returned memory to feed the data.
+  // This is for writing the input tensor.
+  template <typename T>
+  T* mutable_data(PaddlePlace place);
+  // Get the memory directly, will return the place and memory size by pointer.
+  // This is for reading the output tensor.
+  template <typename T>
+  T* data(PaddlePlace* place, int* size) const;
+
+  std::vector<int64_t> shape() const;
+
+  void SetLoD(const std::vector<std::vector<size_t>>& x);
+  std::vector<std::vector<size_t>> lod() const;
+  const std::string& name() const { return name_; }
+
+ protected:
+  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
+  void SetName(const std::string& name) { name_ = name; }
+  void* FindTensor() const;
+
+ private:
+  std::string name_;
+  bool input_or_output_{false};  // explicit default, never indeterminate.
+  friend class AnalysisPredictor;
+  void* scope_{nullptr};
+};
+
+/*
+ * A simple Inference API for Paddle.
+ */
+class PaddlePredictor {
+ public:
+  struct Config;
+  PaddlePredictor() = default;
+  PaddlePredictor(const PaddlePredictor&) = delete;
+  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
+
+  // Predict a record.
+  // The caller should be responsible for allocating and releasing the memory of
+  // `inputs`. `inputs` should be available until Run returns. Caller should be
+  // responsible for the output tensor's buffer, either allocated or passed from
+  // outside.
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;
+
+  // Zero copy input and output optimization.
+  // Get the input or output tensors, and operate on their memory directly,
+  // without copy. The default implementations here return nullptr.
+  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual bool ZeroCopyRun() { return false; }
+
+  // Clone a predictor that shares the model weights; the clone should be
+  // thread-safe.
+  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
+
+  // Destroy the Predictor.
+  virtual ~PaddlePredictor() = default;
+
+  // The common configs for all the predictors.
+  struct Config {
+    std::string model_dir;  // path to the model directory.
+  };
+};
+
+struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields; `use_gpu` is off by default.
+  bool use_gpu{false};
+  int device{0};
+  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
+
+  // Specify the exact path of program and parameter files.
+  std::string prog_file;
+  std::string param_file;
+
+  // Specify the variable's name of each input if input tensors don't follow the
+  // `feeds` and `fetches` of the phase `save_inference_model`.
+  bool specify_input_name{false};
+};
+
+// A factory to help create different predictors.
+//
+// Usage:
+//
+// NativeConfig config;
+// ... // change the configs.
+// auto native_predictor = CreatePaddlePredictor(config);
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type. Similar configs can be
+// merged, but there shouldn't be a huge config containing different fields for
+// more than one kind of predictor.
+template <typename ConfigT>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
+// NOTE The following APIs are too trivial, we will discard them in future
+// versions.
+enum class PaddleEngineKind {
+  kNative = 0,             // Use the native Fluid facility.
+  kAutoMixedTensorRT = 1,  // Automatically mix Fluid with TensorRT.
+  kAnalysis = 2,           // More optimization.
+  kAnakin = 3              // Use Anakin for inference, not mature yet.
+};
+
+template <typename ConfigT, PaddleEngineKind engine>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
+int PaddleDtypeSize(PaddleDType dtype);
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index a755ccb93bdee018dfeaf91157e7971b4d4cd832..92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -26,265 +26,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-namespace paddle {
-
-// Data type.
-enum PaddleDType {
-  FLOAT32,
-  INT64,
-  // TODO(Superjomn) support more data types if needed.
-};
-
-/*
- * Memory menage for PaddleTensor.
- * The PaddleBuf holds a buffer for data input or output. The memory can be
- * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- * should be reused for better performance.
- *
- * For user allocated memory, the following API can be used:
- * - PaddleBuf(void* data, size_t length) to set an external memory by
- * specifying
- *   the memory address and length.
- * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
- * memory.
- * ATTENTION, for user allocated memory, deallocation should be done by users
- * externally after the program finished. The PaddleBuf won't do any allocation
- * or deallocation.
- *
- * To have the PaddleBuf allocate and manage the memory:
- * - PaddleBuf(size_t length) will allocate a memory of size `length`.
- * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
- *   if the allocated memory is larger than `length`, nothing will done.
- */
-class PaddleBuf {
- public:
-  // PaddleBuf allocate memory internally, and manage it.
-  explicit PaddleBuf(size_t length)
-      : data_(new char[length]), length_(length), memory_owned_(true) {}
-  // Set external memory, the PaddleBuf won't manage it.
-  PaddleBuf(void* data, size_t length)
-      : data_(data), length_(length), memory_owned_{false} {}
-  // Copy only available when memory is managed externally.
-  explicit PaddleBuf(const PaddleBuf&);
-
-  // Resize the memory.
-  void Resize(size_t length);
-  // Reset to external memory, with address and length set.
-  void Reset(void* data, size_t length);
-  // Tell whether the buffer is empty.
-  bool empty() const { return length_ == 0; }
-  // Get the memory address.
-  void* data() const { return data_; }
-  // Get the memory length.
-  size_t length() const { return length_; }
-
-  ~PaddleBuf() { Free(); }
-  PaddleBuf& operator=(const PaddleBuf&);
-  PaddleBuf& operator=(PaddleBuf&&);
-  PaddleBuf() = default;
-  PaddleBuf(PaddleBuf&& other);
-
- private:
-  void Free();
-  void* data_{nullptr};  // pointer to the data memory.
-  size_t length_{0};     // number of memory bytes.
-  bool memory_owned_{true};
-};
-
-// Basic input and output data structure for PaddlePredictor.
-struct PaddleTensor {
-  PaddleTensor() = default;
-  std::string name;  // variable name.
-  std::vector<int> shape;
-  PaddleBuf data;  // blob of data.
-  PaddleDType dtype;
-  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
-};
-
-enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-// Tensor without copy, currently only supports AnalysisPredictor.
-class ZeroCopyTensor {
- public:
-  void Reshape(const std::vector<int>& shape);
-
-  // Get the memory in CPU or GPU with specific data type, should Reshape first
-  // to tell the data size.
-  // Once can directly call this data to feed the data.
-  // This is for write the input tensor.
-  template <typename T>
-  T* mutable_data(PaddlePlace place);
-  // Get the memory directly, will return the place and memory size by pointer.
-  // This is for reading the output tensor.
-  template <typename T>
-  T* data(PaddlePlace* place, int* size);
-
-  std::vector<int64_t> shape();
-
-  void SetLoD(const std::vector<std::vector<size_t>>& x);
-  std::vector<std::vector<size_t>> lod() const;
-
- protected:
-  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
-  void SetName(const std::string& name) { name_ = name; }
-  void* FindTensor() const;
-
- private:
-  std::string name_;
-  bool input_or_output_;
-  friend class AnalysisPredictor;
-  void* scope_{nullptr};
-};
-
-/*
- * A simple Inference API for Paddle.
- */
-class PaddlePredictor {
- public:
-  struct Config;
-  PaddlePredictor() = default;
-  PaddlePredictor(const PaddlePredictor&) = delete;
-  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
-
-  // Predict an record.
-  // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be available until Run returns. Caller should be
-  // responsible for the output tensor's buffer, either allocated or passed from
-  // outside.
-  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data,
-                   int batch_size = -1) = 0;
-
-  // Zero copy input and output optimization.
-  // Get the input or output tensors, and operate on their memory directly,
-  // without copy.
-  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
-      const std::string& name) {
-    return nullptr;
-  }
-  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
-      const std::string& name) {
-    return nullptr;
-  }
-  virtual bool ZeroCopyRun() { return false; }
-
-  // Clone a predictor that share the model weights, the Cloned predictor should
-  // be thread-safe.
-  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
-
-  // Destroy the Predictor.
-  virtual ~PaddlePredictor() = default;
-
-  // The common configs for all the predictors.
-  struct Config {
-    std::string model_dir;  // path to the model directory.
-  };
-};
-
-struct NativeConfig : public PaddlePredictor::Config {
-  // GPU related fields.
-  bool use_gpu{false};
-  int device{0};
-  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
-
-  // Specify the exact path of program and parameter files.
-  std::string prog_file;
-  std::string param_file;
-
-  // Specify the variable's name of each input if input tensors don't follow the
-  // `feeds` and `fetches` of the phase `save_inference_model`.
-  bool specify_input_name{false};
-};
-
-// A factory to help create different predictors.
-//
-// Usage:
-//
-// NativeConfig config;
-// ... // change the configs.
-// auto native_predictor = CreatePaddlePredictor(config);
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type. Similar configs can be
-// merged, but there shouldn't be a huge config containing different fields for
-// more than one kind of predictors.
-template <typename ConfigT>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
-// NOTE The following APIs are too trivial, we will discard it in the following
-// versions.
-enum class PaddleEngineKind {
-  kNative = 0,         // Use the native Fluid facility.
-  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  kAnalysis,           // More optimization.
-  kAnakin              // Use Anakin for inference, not mature yet.
-};
-
-template <typename ConfigT, PaddleEngineKind engine>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
-// ==
-//
-// -----------------------------------------------------------------------------------
-// NOTE: The following APIs are not mature yet, we are still working on them.
-
-namespace contrib {
-
-// Accelerate GPU computation with TensorRT engine.
-struct MixedRTConfig : public NativeConfig {
-  // Determine whether a subgraph will be executed by TRT.
-  int min_subgraph_size{1};
-  // While TensorRT allows an engine optimized for a given max batch size
-  // to run at any smaller size, the performance for those smaller
-  // sizes may not be as well-optimized. Therefore, Max batch is best
-  // equivalent to the runtime batch size.
-  int max_batch_size{1};
-  // For workspace_size, refer it from here:
-  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
-  int workspace_size{1 << 30};
-  //  We transform the Ops that can be converted into TRT layer in the model,
-  //  and aggregate these Ops into subgraphs for TRT execution.
-  //  We set this variable to control the minimum number of nodes in the
-  //  subgraph, 3 as default value.
-  int minimum_subgraph_size = 3;
-  // Reserved configuration
-  // We just support "FP32" now, "FP16" and "INT8" will be supported.
-  std::string precision_mode = "FP32";
-};
-
-// NOTE WIP, not stable yet.
-struct AnalysisConfig : public NativeConfig {
-  enum class IrPassMode {
-    kSystem,   // Use system default passes, not customize.
-    kInclude,  // Specify the passes in `ir_passes`.
-    kExclude   // Specify the disabled passes in `ir_passes`.
-  };
-
-  // Determine whether to perform graph optimization.
-  bool enable_ir_optim = true;
-  // Manually determine the IR passes to run.
-  IrPassMode ir_mode{IrPassMode::kExclude};
-  // passes to be excluded/included
-  std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
-
-  // NOT stable yet.
-  bool use_feed_fetch_ops{true};
-
-  // NOTE this is just for internal development, please not use it.
-  // NOT stable yet.
-  bool _use_mkldnn{false};
-};
-
-// Configurations for Anakin engine.
-struct AnakinConfig : public PaddlePredictor::Config {
-  enum TargetType { NVGPU = 0, X86 };
-  int device;
-  std::string model_file;
-  int max_batch_size{-1};
-  TargetType target_type;
-};
-
-}  // namespace contrib
-
-int PaddleDtypeSize(PaddleDType dtype);
-
-}  // namespace paddle
+#include "paddle_api.h"  // NOLINT
+#ifndef WITH_ANAKIN
+#include "paddle_analysis_config.h"  // NOLINT
+#else
+#include "paddle_anakin_config.h"  // NOLINT
+#endif
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc3ce72f0832c4bf029f86e023bd9ff11f6578bd
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/paddle_pass_builder.h"
+#include <glog/logging.h>
+
+namespace paddle {
+
+void PaddlePassBuilder::AppendPass(const std::string &pass_type) {
+  passes_.emplace_back(pass_type);  // The new pass goes last in the list.
+}
+
+void PaddlePassBuilder::TurnOnDebug() {
+  // Put a "graph_viz_pass" right after every other pass, skipping passes that
+  // are already followed by one so repeated calls do not add duplicates.
+  const std::string viz = "graph_viz_pass";
+  for (size_t i = 0; i < passes_.size(); ++i) {
+    const bool has_viz = i + 1 < passes_.size() && passes_[i + 1] == viz;
+    if (passes_[i] != viz && !has_viz) {
+      passes_.insert(std::begin(passes_) + i + 1, viz);
+    }
+  }
+}
+
+std::string PaddlePassBuilder::DebugString() {
+  // Builds a human-readable listing of the stored passes:
+  // a header line, then one "  - <pass>" line per pass in stored order.
+  std::ostringstream out;
+  out << "Passes to apply:\n";
+  for (const std::string &name : passes_) out << "  - " << name << '\n';
+  return out.str();
+}
+
+void PaddlePassBuilder::DeletePass(const std::string &pass_type) {
+  // Drop every stored pass whose type equals `pass_type`.
+  for (size_t pos = 0; pos < passes_.size();) {
+    if (passes_[pos] != pass_type) {
+      ++pos;
+      continue;
+    }
+    passes_.erase(std::begin(passes_) + pos);  // `pos` now names the next one.
+  }
+}
+
+void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) {
+  CHECK_LE(idx, passes_.size()) << "InsertPass: idx out of range";  // == appends
+  passes_.insert(std::begin(passes_) + idx, pass_type);
+}
+void PaddlePassBuilder::DeletePass(size_t idx) {
+  CHECK_LT(idx, passes_.size()) << "DeletePass: idx out of range";
+  passes_.erase(std::begin(passes_) + idx);  // idx names an existing pass here.
+}
+void GpuPassStrategy::EnableMKLDNN() {
+  LOG(ERROR) << "GPU not support MKLDNN yet";  // Only logs; no pass is added.
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..8aad5c5984891546776bc52327337c94c387d6dc
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace paddle {
+/*
+ * This is a pass builder based on string. It is part of inference API.
+ */
+class PaddlePassBuilder {
+ public:
+  explicit PaddlePassBuilder(const std::vector<std::string> &passes)
+      : passes_(passes) {}
+
+  void AppendPass(const std::string &pass_type);  // Add at the end.
+
+  void InsertPass(size_t idx, const std::string &pass_type);  // At index idx.
+
+  // Delete the `idx`-th pass.
+  void DeletePass(size_t idx);
+
+  // Delete all the passes that have type `pass_type`.
+  void DeletePass(const std::string &pass_type);
+
+  // Visualize the computation graph after each pass by generating a DOT
+  // language file, one can draw them with the Graphviz toolkit.
+  void TurnOnDebug();
+
+  // Human-readable information.
+  std::string DebugString();
+
+  const std::vector<std::string> &AllPasses() const { return passes_; }
+
+ protected:
+  std::vector<std::string> passes_;  // Pass type names, kept in list order.
+};
+
+/*
+ * Pass strategy to help control the IR passes.
+ */
+class PassStrategy : public PaddlePassBuilder {
+ public:
+  explicit PassStrategy(const std::vector<std::string> &passes)
+      : PaddlePassBuilder(passes) {}
+
+  // The MKLDNN control exists in both CPU and GPU mode, because there can still
+  // be some CPU kernels running even in GPU mode (presumably -- confirm).
+  virtual void EnableMKLDNN() = 0;
+
+  virtual ~PassStrategy() = default;
+};
+
+/*
+ * The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
+ */
+class CpuPassStrategy : public PassStrategy {
+ public:
+  // Builds the default CPU pass list.
+  // NOTE the large fusions should be located in the front, so that they will
+  // not be damaged by smaller ones.
+  CpuPassStrategy()
+      : PassStrategy(std::vector<std::string>{
+            "infer_clean_graph_pass",         //
+            "attention_lstm_fuse_pass",       //
+            "seqconv_eltadd_relu_fuse_pass",  //
+            // "embedding_fc_lstm_fuse_pass", //
+            "fc_lstm_fuse_pass",             //
+            "mul_lstm_fuse_pass",            //
+            "fc_gru_fuse_pass",              //
+            "mul_gru_fuse_pass",             //
+            "seq_concat_fc_fuse_pass",       //
+            "fc_fuse_pass",                  //
+            "conv_bn_fuse_pass",             //
+            "conv_eltwiseadd_bn_fuse_pass",  //
+        }) {}
+
+  // Copies the pass list of `other`.
+  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
+
+  ~CpuPassStrategy() override = default;
+
+  // Puts "mkldnn_placement_pass" first and appends the MKLDNN fuse passes.
+  // Does nothing unless PADDLE_WITH_MKLDNN is defined.
+  void EnableMKLDNN() override {
+// TODO(Superjomn) Consider the way to mix CPU with GPU.
+#ifdef PADDLE_WITH_MKLDNN
+    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+    const char *fuse_passes[] = {"depthwise_conv_mkldnn_pass",  //
+                                 "conv_bias_mkldnn_fuse_pass",  //
+                                 "conv_relu_mkldnn_fuse_pass",  //
+                                 "conv_elementwise_add_mkldnn_fuse_pass"};
+    for (const char *pass : fuse_passes) passes_.emplace_back(pass);
+#endif
+  }
+};
+
+/*
+ * The GPU passes strategy; presumably used in AnalysisPredictor with GPU mode.
+ */
+class GpuPassStrategy : public PassStrategy {
+ public:
+  // Starts with the two default GPU passes listed below.
+  GpuPassStrategy()
+      : PassStrategy(std::vector<std::string>{
+            "infer_clean_graph_pass",  //
+            "conv_bn_fuse_pass",       //
+        }) {}
+
+  // Copies the pass list of `other`.
+  GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.passes_) {}
+
+  void EnableMKLDNN() override;  // Defined out of line.
+  ~GpuPassStrategy() override = default;
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index e246a06fd079d837ac321197914c9f70b528f2c8..bb749e8f8b0ba9d5cd82d91ce86c619f52f34c30 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
   if (var->Persistable() &&
       var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST &&
+      var->GetType() != framework::proto::VarType::RAW) {
     return true;
   }
   return false;
@@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
 
   for (auto* var : global_block.AllVars()) {
     if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(30) << "persistable variable's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());
@@ -120,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
-  VLOG(3) << "loading model from " << model_filename;
+  VLOG(30) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index e73c5bbf57501e4ff3c080a46d91685035652bfa..0b756534ec6fbf27a3e92bf39fb7544d9785ca48 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -27,7 +27,7 @@ class ActivationOpConverter : public OpConverter {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    LOG(INFO)
+    VLOG(3)
         << "convert a fluid Activation op to tensorrt activation layer whose "
            "type is "
         << op_type_;
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 3330af2da6c97ad153dcecd86be4b441eac62b5e..d017bac66dd99a4b54c44ec786de61d1e66b8981 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -23,7 +23,7 @@ class BatchNormOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm";
+    VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm";
 
     framework::OpDesc op_desc(op, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index a11dfa1e8f2dacfad067d025678911200db500fb..b2e7c593e85974898012f8a353817a27ca212f4d 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 7bcf2dd1eeb17e802c5647df31945284ae08fa95..43950b8c048b4e1b8974956948caa639812b2f78 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -37,8 +37,7 @@ class Conv2dOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    LOG(INFO)
-        << "convert a fluid conv2d op to tensorrt conv layer without bias";
+    VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
index 9533ecbcfda4e2500fd201d8efc64fc5bd97169a..ddbc724e3b2a48b75df17f9bda691a1fd3883c32 100644
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
+    VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 0a6ce568f194f03c7259e1ebf28dd6ce4df2d594..671bcd8aa9a9fff34644a056499961cf6ab81287 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -26,7 +26,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
 
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -108,7 +108,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
 
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 7c21ecd95da07b498eed2ab1bbdcc0e8cd184787..eef4fab4e86f05fa80bc614371f1aa43e433407e 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+    VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 514eb659a8da73b6e56b5d17148ec0cb2aeaa135..5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
index 218030a591fcc7e533ef37062265449d4b6044bc..4afcb0aecec9d07b52d2fd701fae8750067a6041 100644
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
@@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer";
+    VLOG(3) << "convert a fluid pad op to tensorrt padding layer";
 
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 677f85152f202b514d0563f885d872c84faba19a..48850020840a49bd309c007943f14b2f7eec5e2d 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
+    VLOG(3)
         << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 0064f90fd7944403c14d4d47616ea82f681ceb74..80bfb2d190a5637032e7c18fbac7f22b3a9e81e1 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
+    VLOG(3)
         << "convert a fluid softmax op to tensorrt softmax layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 9e0f95844761db7571c5313726d34685a9aa66b2..8adc3baca64845f596477a0abe61be31e7377d9f 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -61,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() {
 }
 
 void TensorRTEngine::FreezeNetwork() {
+  VLOG(3) << "TRT to freeze network";
   freshDeviceId();
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index d9d3827321127631c0af6e5cfd2dfdd640cee146..828181200e300c370bbfa234c3c23ae44810878c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
       weight_map;
 
-  // TODO: (NHZLX)
+  // TODO(NHZLX)
   // In the normal case, the paddle-trt exists bug when runing the googlenet.
   // When there are more than two convolutions of 1 * 1 with the same input, the
   // paddle-tensorrt will do the merging optimization, which fuse those conv
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index b6e7968108403c9c9c192759c44eac040d1c5073..fc7ca7714e9325d2b6bce6189300aa339c81c2ba 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -52,7 +52,7 @@ class NaiveLogger : public nvinfer1::ILogger {
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
     switch (severity) {
       case Severity::kINFO:
-        LOG(INFO) << msg;
+        VLOG(3) << msg;
         break;
       case Severity::kWARNING:
         LOG(WARNING) << msg;
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 401ef508bc5918ba55f237a8f123aa83eba10b41..fe0937da104cf678221143c54b456ac476f4ddf1 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -112,7 +112,8 @@ if(WITH_GPU AND TENSORRT_FOUND)
    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
    endif()
-   cc_test(test_trt_models SRCS trt_models_tester.cc  
-     ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
-     DEPS paddle_inference_tensorrt_subgraph_engine)
+
+   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
+        ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index e5c8dfd22a006d5271248c5b083aab2c22417502..5c92096d9d3e607d79ca74e16a558a4999c44414 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -37,7 +37,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg._use_mkldnn = use_mkldnn;
+
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -65,7 +68,9 @@ TEST(Analyzer_resnet50, fuse_statis) {
 void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg._use_mkldnn = use_mkldnn;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index e0416ff953b61f56a2ca1a45cb382d40a6cffa4a..612ae121b2ecbccb0ba8faf72aef83ec01a104bd 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -210,7 +210,6 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->specify_input_name = true;
   cfg->enable_ir_optim = true;
-  cfg->ir_passes.clear();  // Do not exclude any pass.
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -226,13 +225,15 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 
 // Easy for profiling independently.
 TEST(Analyzer_rnn1, profile) {
-  contrib::AnalysisConfig cfg;
+  contrib::AnalysisConfig cfg(false);
   SetConfig(&cfg);
-  cfg.use_gpu = false;
+  cfg.fraction_of_gpu_memory = 0.1;
+  cfg.pass_builder()->TurnOnDebug();
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
+  LOG(INFO) << "to test prediction";
   TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
 }
 
@@ -274,31 +275,6 @@ TEST(Analyzer_rnn1, multi_thread) {
   TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */);
 }
 
-bool CompareTensors(const framework::Scope &a_scope,
-                    const framework::Scope &b_scope,
-                    const std::vector<std::string> &tensors) {
-  for (auto &x : tensors) {
-    auto *a_var = a_scope.FindVar(x);
-    auto *b_var = b_scope.FindVar(x);
-    if (a_var && b_var) {
-      if (a_var->Type() == typeid(framework::LoDTensor) ||
-          a_var->Type() == typeid(framework::Tensor)) {
-        LOG(INFO) << "comparing tensor " << x;
-        auto &a_t = a_var->Get<framework::LoDTensor>();
-        auto &b_t = b_var->Get<framework::LoDTensor>();
-        if (!inference::CompareTensor(a_t, b_t)) {
-          LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
-        }
-      } else {
-        LOG(INFO) << "skip no tensor " << x;
-      }
-    } else {
-      LOG(INFO) << "skip tensor " << x;
-    }
-  }
-  return true;
-}
-
 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
 // on the complex RNN1 model.
 TEST(Analyzer_rnn1, ZeroCopy) {
@@ -307,7 +283,6 @@ TEST(Analyzer_rnn1, ZeroCopy) {
   config.use_feed_fetch_ops = false;
 
   PaddlePlace place;
-  int output_size{0};
 
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
@@ -353,86 +328,22 @@ TEST(Analyzer_rnn1, ZeroCopy) {
 
   Timer timer;
   double total_time{0};
-  double native_total_time{0};
-  double analysis_total_time{0.};
-
   for (int i = 0; i < FLAGS_repeat; i++) {
     timer.tic();
     predictor->ZeroCopyRun();
     total_time += timer.toc();
   }
+  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
 
-  auto *output_data = output_tensor->data<float>(&place, &output_size);
-  ASSERT_GT(output_size, 0);  // more than one output!
-
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    // Run native predictor.
-    timer.tic();
-    ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
-    native_total_time += timer.toc();
-  }
-
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    timer.tic();
-    ASSERT_TRUE(
-        analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
-    analysis_total_time += timer.toc();
-  }
-
-  if (!FLAGS_with_precision_check) {
-    return;
-  }
-  int native_output_size = VecReduceToInt(native_outputs.front().shape);
-
-  EXPECT_EQ(native_output_size, output_size);
+  ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
+  LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
 
-  // Compare tensors between analysis and zerocopy
-  auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
-  auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
-  auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());
-
-  std::vector<std::string> tensor_names;
-  for (auto &var_desc : p0->program().Block(0).AllVars()) {
-    tensor_names.push_back(var_desc->Name());
-  }
-
-  LOG(INFO) << "Comparing tensors";
-  ASSERT_TRUE(
-      CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
-  ASSERT_TRUE(
-      CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));
-
-  LOG(INFO) << "output1 " << inference::LoDTensorSummary<float>(
-                                 p0->scope()
-                                     ->FindVar("final_output.tmp_1")
-                                     ->Get<framework::LoDTensor>());
-  LOG(INFO) << "output2 " << inference::LoDTensorSummary<float>(
-                                 p1->scope()
-                                     ->FindVar("final_output.tmp_1")
-                                     ->Get<framework::LoDTensor>());
-  LOG(INFO) << "output3 " << inference::LoDTensorSummary<float>(
-                                 p2->scope()
-                                     ->FindVar("final_output.tmp_1")
-                                     ->Get<framework::LoDTensor>());
-
-  for (int i = 0; i < output_size; i++) {
-    LOG(INFO) << output_data[i] << " "
-              << static_cast<float *>(native_outputs.front().data.data())[i]
-              << " "
-              << static_cast<float *>(analysis_outputs.front().data.data())[i];
-    EXPECT_NEAR(output_data[i],
-                static_cast<float *>(native_outputs.front().data.data())[i],
-                1e-3);
+  int output_size{0};  // number of elements, not the size in bytes
+  auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
+  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
+  for (int i = 0; i < output_size; i++) {  // compare every output element
+    EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
   }
-
-  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
-
-  LOG(INFO) << "zero average time: "
-            << total_time / (FLAGS_repeat * FLAGS_batch_size);
-  LOG(INFO) << "analysis average time: "
-            << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
-  LOG(INFO) << "native average time: "
-            << native_total_time / (FLAGS_repeat * FLAGS_batch_size);
 }
 
 TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index ca19475bda372398d425b0fa6f9a732cd79a8166..05bffede472d9674aa4213468662d7e08792035b 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -108,9 +108,7 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   // Enable embedding_fc_lstm_fuse_pass (disabled by default)
-  auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(),
-                      "embedding_fc_lstm_fuse_pass");
-  if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it);
+  cfg.pass_builder()->InsertPass(2, "embedding_fc_lstm_fuse_pass");
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 8933296490793a7693124eba23f8cf0801881e14..8fafd25b781a1755cce3d882e92b7ed018d3686c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -27,7 +27,7 @@ struct Record {
 };
 
 Record ProcessALine(const std::string &line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
   std::vector<std::string> columns;
   split(line, '\t', &columns);
   CHECK_EQ(columns.size(), 2UL)
@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
   for (auto &s : shape_strs) {
     record.shape.push_back(std::stoi(s));
   }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
   return record;
 }
 
@@ -58,7 +58,10 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
   // TODO(TJ): fix fusion gru
-  cfg->ir_passes.push_back("fc_gru_fuse_pass");
+  cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
+#ifdef PADDLE_WITH_MKLDNN
+  cfg->EnableMKLDNN();  // NOTE(review): also on when use_mkldnn=false
+#endif
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -84,7 +87,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg._use_mkldnn = use_mkldnn;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -125,7 +130,9 @@ TEST(Analyzer_vis, fuse_statis) {
 void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg._use_mkldnn = use_mkldnn;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 8c5888d8da7b33eeca77311c10dd818648e8e524..ab4ab20b58020e45f5002d4436d621004e4326fa 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -20,6 +20,7 @@
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
@@ -88,22 +89,25 @@ size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
 
 std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
                                                    int *num_ops) {
+  std::unordered_map<std::string, int> res;
   auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
-  auto &fuse_statis = analysis_predictor->analysis_argument()
-                          .Get<std::unordered_map<std::string, int>>(
-                              framework::ir::kFuseStatisAttr);
-  for (auto &item : fuse_statis) {
+  auto *fusion_status =
+      analysis_predictor->analysis_argument().fusion_statis_ptr();
+  if (!fusion_status) {
+    return res;
+  }
+  for (auto &item : *fusion_status) {
     LOG(INFO) << "fused " << item.first << " " << item.second;
   }
   int num = 0;
   for (auto &node :
-       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-    if (node->IsFunction()) {
+       analysis_predictor->analysis_argument().main_graph().Nodes()) {
+    if (node->IsOp()) {
       ++num;
     }
   }
   *num_ops = num;
-  return fuse_statis;
+  return *fusion_status;
 }
 
 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
@@ -161,11 +165,12 @@ void TestMultiThreadPrediction(
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
   std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-  // because AttentionLSTM's hard code nodeid will be damanged.
-  for (int tid = 0; tid < num_threads; ++tid) {
-    predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; ++tid) {
+    predictors.emplace_back(predictors.front()->Clone());
   }
+
+  size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
 #ifdef PADDLE_WITH_MKLDNN
@@ -173,17 +178,21 @@ void TestMultiThreadPrediction(
 #endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
-      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
       std::vector<PaddleTensor> outputs_tid;
+      auto &predictor = predictors[tid];
+      LOG(INFO) << "running thread " << tid;
       Timer timer;
       timer.tic();
       for (int i = 0; i < num_times; i++) {
-        for (size_t j = 0; j < inputs_tid.size(); j++) {
-          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
+        for (const auto &input : inputs) {
+          ASSERT_TRUE(predictor->Run(input, &outputs_tid));
         }
       }
-      PrintTime(batch_size, num_times, num_threads, tid,
-                timer.toc() / num_times, inputs_tid.size());
+
+      auto time = timer.toc();  // elapsed time for this thread's whole run
+      total_time += time;  // NOTE(review): racy, shared by all threads
+      PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
+                inputs.size());
     });
   }
   for (int i = 0; i < num_threads; ++i) {
@@ -196,7 +205,7 @@ void TestPrediction(const AnalysisConfig &config,
                     std::vector<PaddleTensor> *outputs, int num_threads,
                     bool use_analysis = FLAGS_use_analysis) {
   LOG(INFO) << "use_analysis: " << use_analysis
-            << ", use_mkldnn: " << config._use_mkldnn;
+            << ", use_mkldnn: " << config.use_mkldnn();
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
   } else {
@@ -208,7 +217,7 @@ void TestPrediction(const AnalysisConfig &config,
 void CompareNativeAndAnalysis(
     const AnalysisConfig &config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
-  LOG(INFO) << "use_mkldnn: " << config._use_mkldnn;
+  LOG(INFO) << "use_mkldnn: " << config.use_mkldnn();
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 75840a9c437d956da4f542a38b2532ea20ee96c5..71423154f84797cf564dd4e71941853fae5a0767 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -16,10 +16,13 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
-using paddle::contrib::MixedRTConfig;
+using paddle::contrib::AnalysisConfig;
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
@@ -27,33 +30,24 @@ NativeConfig GetConfigNative() {
   NativeConfig config;
   config.model_dir = FLAGS_dirname;
   // LOG(INFO) << "dirname  " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.45;
+  config.fraction_of_gpu_memory = 0.15;
   config.use_gpu = true;
   config.device = 0;
   return config;
 }
 
-MixedRTConfig GetConfigTRT() {
-  MixedRTConfig config;
-  config.model_dir = FLAGS_dirname;
-  config.use_gpu = true;
-  config.fraction_of_gpu_memory = 0.2;
-  config.device = 0;
-  config.max_batch_size = 3;
-  return config;
+void PrepareTRTConfig(AnalysisConfig *config) {
+  config->model_dir = FLAGS_dirname + "/" + "mobilenet";
+  config->fraction_of_gpu_memory = 0.15;
+  config->EnableTensorRtEngine(1 << 10, 5);
+  config->pass_builder()->DeletePass("conv_bn_fuse_pass");
+  config->pass_builder()->DeletePass("fc_fuse_pass");
+  config->pass_builder()->TurnOnDebug();
 }
 
-void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
-  NativeConfig config0 = GetConfigNative();
-  config0.model_dir = model_dirname;
-
-  MixedRTConfig config1 = GetConfigTRT();
-  config1.model_dir = model_dirname;
-  config1.max_batch_size = batch_size;
-
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
-  // Prepare inputs
+void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) {
+  PADDLE_ENFORCE_EQ(tensors->size(), 1UL);
+  auto &tensor = tensors->front();
   int height = 224;
   int width = 224;
   float *data = new float[batch_size * 3 * height * width];
@@ -61,25 +55,34 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
   data[0] = 1.0f;
 
   // Prepare inputs
-  PaddleTensor tensor;
   tensor.name = "input_0";
   tensor.shape = std::vector<int>({batch_size, 3, height, width});
   tensor.data = PaddleBuf(static_cast<void *>(data),
                           sizeof(float) * (batch_size * 3 * height * width));
   tensor.dtype = PaddleDType::FLOAT32;
-  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+}
+
+void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
+  auto config0 = GetConfigNative();
+  config0.model_dir = model_dirname;
+
+  AnalysisConfig config1(true);
+  PrepareTRTConfig(&config1);
+  config1.model_dir = model_dirname;
+
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
+  auto predictor1 = CreatePaddlePredictor(config1);
+
+  // Prepare inputs
+  std::vector<PaddleTensor> paddle_tensor_feeds(1);
+  PrepareInputs(&paddle_tensor_feeds, batch_size);
 
   // Prepare outputs
   std::vector<PaddleTensor> outputs0;
   std::vector<PaddleTensor> outputs1;
   CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-
   CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
 
-  // Get output.
-  ASSERT_EQ(outputs0.size(), 1UL);
-  ASSERT_EQ(outputs1.size(), 1UL);
-
   const size_t num_elements = outputs0.front().data.length() / sizeof(float);
   const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
   EXPECT_EQ(num_elements, num_elements1);
@@ -94,15 +97,52 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
 }
 
 TEST(trt_models_test, mobilenet) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet");
 }
-
 TEST(trt_models_test, resnet50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50");
 }
-
 TEST(trt_models_test, resnext50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50");
+}
+
+TEST(trt_models_test, raw_gpu) {
+  std::string model_dir = FLAGS_dirname + "/" + "mobilenet";
+  auto config0 = GetConfigNative();
+  config0.model_dir = model_dir;
+  int batch_size = 2;
+
+  AnalysisConfig config1(true);
+  config1.fraction_of_gpu_memory = 0.1;
+  config1.enable_ir_optim = true;
+  config1.model_dir = model_dir;
+
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
+  auto predictor1 = CreatePaddlePredictor(config1);
+
+  // Prepare inputs
+  std::vector<PaddleTensor> paddle_tensor_feeds(1);
+  PrepareInputs(&paddle_tensor_feeds, batch_size);
+
+  // Prepare outputs
+  std::vector<PaddleTensor> outputs0;
+  std::vector<PaddleTensor> outputs1;
+  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
+  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+
+  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+  EXPECT_EQ(num_elements, num_elements1);
+
+  auto *data0 = static_cast<float *>(outputs0.front().data.data());
+  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+  ASSERT_GT(num_elements, 0UL);
+  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+  }
 }
 
 }  // namespace paddle
+
+USE_PASS(tensorrt_subgraph_pass);
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 26ef27c3caafadb4801b0ae52133f6175655ce0a..dd7ffaa26426edebd47ec3f6fb275ad5a2d23322 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
       system_allocator_(std::move(system_allocator)) {}
 
 BuddyAllocator::~BuddyAllocator() {
-  VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
-              "have actually been freed";
+  VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these "
+               "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(100) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
 
     system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);
 
-  VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
-           << size;
+  VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size "
+            << size;
 
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
-    VLOG(10) << "Allocate from system allocator.";
+    VLOG(100) << "Allocate from system allocator.";
     return SystemAlloc(size);
   }
 
@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
       return nullptr;
     }
   } else {
-    VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
-             << " at address "
-             << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+    VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it)
+              << " at address "
+              << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }
 
   total_used_ += size;
@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
   // Acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);
 
-  VLOG(10) << "Free from address " << block;
+  VLOG(100) << "Free from address " << block;
 
   if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
-    VLOG(10) << "Free directly from system allocator";
+    VLOG(100) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
 
@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {
 
   // Trying to merge the right buddy
   if (block->has_right_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its right buddy "
-             << block->right_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its right buddy "
+              << block->right_buddy(cache_);
 
     auto right_buddy = block->right_buddy(cache_);
 
@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {
 
   // Trying to merge the left buddy
   if (block->has_left_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its left buddy "
-             << block->left_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its left buddy "
+              << block->left_buddy(cache_);
 
     auto left_buddy = block->left_buddy(cache_);
 
@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
   }
 
   // Dumping this block into pool
-  VLOG(10) << "Inserting free block (" << block << ", "
-           << block->total_size(cache_) << ")";
+  VLOG(100) << "Inserting free block (" << block << ", "
+            << block->total_size(cache_) << ")";
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
 
-  VLOG(10) << "Allocated " << p << " from system allocator.";
+  VLOG(100) << "Allocated " << p << " from system allocator.";
 
   if (p == nullptr) return nullptr;
 
@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 
   if (p == nullptr) return pool_.end();
 
-  VLOG(10) << "Creating and inserting new block " << p
-           << " from system allocator";
+  VLOG(100) << "Creating and inserting new block " << p
+            << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      max_chunk_size_, nullptr, nullptr);
@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
   pool_.erase(it);
 
-  VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
-           << ") into";
+  VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_)
+            << ") into";
   block->split(&cache_, size);
 
-  VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
-           << ")";
+  VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_)
+            << ")";
   block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
   if (block->has_right_buddy(cache_)) {
     if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", "
-               << block->right_buddy(cache_)->total_size(cache_) << ")";
+      VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                << block->right_buddy(cache_)->total_size(cache_) << ")";
 
       pool_.insert(
           IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
       return;
     }
 
-    VLOG(10) << "Return block " << block << " to fallback allocator.";
+    VLOG(100) << "Return block " << block << " to fallback allocator.";
 
     system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
 
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
-    VLOG(10) << "Return block " << block << " to base allocator.";
+    VLOG(100) << "Return block " << block << " to base allocator.";
 
     system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index b86e4f38c42a26e155f276f9b73cbed1d0d83f7d..152e4e7f9fa2e18a2b3e5b4042089660d291badf 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
     return existing_desc->second;
   } else {
     auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
-    VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
+    VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type;
     PADDLE_ASSERT(desc->check_guards());
     return *reinterpret_cast<const MemoryBlock::Desc*>(block);
   }
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555..3400b5274679d8e859a008dcf47ac7122ace6b2d 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <string>
 #include <vector>
 
 #include "paddle/fluid/memory/malloc.h"
@@ -21,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/string/printf.h"
 
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
@@ -71,18 +73,18 @@ struct NaiveAllocator {
 
 template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
-  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(100) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
   if (FLAGS_init_allocated_mem) {
     memset(p, 0xEF, size);
   }
-  VLOG(10) << "  pointer=" << p;
+  VLOG(100) << "  pointer=" << p;
   return p;
 }
 
 template <>
 void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  VLOG(100) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
 
@@ -110,12 +112,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
           std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
           platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
+      VLOG(100) << "\n\nNOTE: each GPU device use "
+                << FLAGS_fraction_of_gpu_memory_to_use * 100
+                << "% of GPU memory.\n"
+                << "You can set GFlags environment variable '"
+                << "FLAGS_fraction_of_gpu_memory_to_use"
+                << "' to change the fraction of GPU usage.\n\n";
     }
   });
 
@@ -137,12 +139,18 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
-                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
+                 << " in GPU " << place.device << ", available "
+                 << string::HumanReadableSize(avail);
     LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    LOG(WARNING) << "GpuMinChunkSize "
+                 << string::HumanReadableSize(
+                        buddy_allocator->GetMinChunkSize());
+    LOG(WARNING) << "GpuMaxChunkSize "
+                 << string::HumanReadableSize(
+                        buddy_allocator->GetMaxChunkSize());
+    LOG(WARNING) << "GPU memory used: "
+                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   }
   if (FLAGS_init_allocated_mem) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 919ad96f7adfc5025d9a8367c467f639c6fe3101..776bdfaee8ac24b066b95328fdb59d240f16a446 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+set(PART_CUDA_KERNEL_FILES)
 function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
     # cc_library. But it handle split GPU/CPU code and link some common library
@@ -37,6 +39,12 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+            set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
+                    ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
+            list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+        endif()
+
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
             list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
         endif()
@@ -296,7 +304,6 @@ op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
 op_library(squeeze_op DEPS reshape_op)
-op_library(extract_rows_op DEPS memory)
 op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
@@ -318,6 +325,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(tensor_array_to_tensor_op DEPS concat_op)
 op_library(concat_op DEPS concat_and_split)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
@@ -327,6 +335,8 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+
+
 if (NOT WIN32)
 add_subdirectory(reader)
 endif(NOT WIN32)
@@ -353,3 +363,14 @@ if(NOT WIN32)
     nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
+
+if(WITH_GPU)  # emit USE_OP_DEVICE_KERNEL lines for kernels found in *.part.cu files
+    foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
+        file(READ "${CUDA_KERNEL_FILE}" TARGET_CONTENT)
+        string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED "${TARGET_CONTENT}")  # quoted: safe for empty files and ';'
+        if (MATCHED)
+            string(STRIP "${CMAKE_MATCH_1}" MATCHED)  # text between '(' and the first ','
+            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
+        endif()
+    endforeach()
+endif()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 9ddb3a5d29f973047507855b43b226913a3600b5..ea260a3e92b775023085fd02eec33e6ecfaf2e81 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
   }
 };
 
-class ActivationOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto& x = block->FindRecursiveOrCreateVar(x_name);
-    auto& out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ActivationOpInferVarType  // pairs input "X" with output "Out"
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // NOTE(review): base presumably copies type/dtype
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
   }
 };
 
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 0747469e0f4c4fe6a323a499c720a54d1e278e09..4ffc7f364bcb9bda5f94be5fe071c73bd5c40ca7 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -95,7 +95,7 @@ class ActivationGradKernel
       auto x = framework::EigenVector<T>::Flatten(*X);
       functor(*place, x, out, dout, dx);
     } else {
-      VLOG(10) << " Inplace activation ";
+      VLOG(100) << " Inplace activation ";
       auto x = framework::EigenVector<T>::Flatten(*dX);
       functor(*place, x, out, dout, dx);
     }
diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc
index a3ef9ad9f91f1f626bd33876693ecc17ad76b96b..c88297ff544ddb0e5a97452a8ad2e8f9f77825ba 100644
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
 
     // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);
 
     math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
     functor(context, grad_square, moment);
diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu
index b25268786d622bc7a94117849763833e528bef48..b99b33343d36fbb7f6b1a2928e142ca615b238b3 100644
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
     framework::Vector<int64_t> merge_rows(grad_merge.rows());
     // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);
 
     math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
     functor(context, grad_square, moment);
diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h
index 0a16ce00f71586ef55007c3753e024be29d0ed56..9f6ef391696aa8718be71ae945e746b876813d94 100644
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
                   framework::Tensor *moment, framework::Tensor *param);
 };
 
+template <typename DeviceContext, typename T>  // returns input^2 per element
+framework::SelectedRows SquareSelectedRows(
+    const DeviceContext &context, const framework::SelectedRows &input) {
+  framework::SelectedRows out;  // rows and height are copied from input
+  out.set_rows(input.rows());
+  out.set_height(input.height());
+  out.mutable_value()->mutable_data<T>(input.value().dims(),
+                                       context.GetPlace());  // same dims
+  auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+  auto e_in = framework::EigenVector<T>::Flatten(input.value());
+  e_out.device(*context.eigen_device()) = e_in.square();  // on context device
+  return out;
+}
+
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index 3455d1ee54e8e6e498d0b0e6932ec099af9c0b30..48e0448d09c64e2c2fa655d125064e7a6572e30e 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
       auto& grad =
           Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
       if (grad.rows().size() == 0) {
-        VLOG(3) << "grad row size is 0!!";
+        VLOG(30) << "grad row size is 0!!";
         return;
       }
 
diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h
index 5f371235f160c416058e877dbba2d9fe89abf7db..0b40d3de890a02a9dbec2328f9f6388ffa35561b 100644
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
           x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
       for (int j = 0; j < max_length; ++j) {
         for (int k = 0; k < half_size; ++k) {
-          const double val = (half_size > 1)
-                                 ? j / pow(10000.0, double(k) / (half_size - 1))
-                                 : j / 10000.0;
+          const double val =
+              (half_size > 1)
+                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
+                  : j / 10000.0;
           dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
           dst_ptr[half_size + k] =
               src_ptr[half_size + k] * alpha + cos(val) * beta;
diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index 4309f0a5497456065e5c43bc8f7b265fa711f699..eddf34494bdab18c9d4ae1fb3d1e5d1a71fe590e 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
     } else {
       offset = static_cast<size_t>(*i_tensor.data<int64_t>());
     }
-    VLOG(10) << " Offset = " << offset;
+    VLOG(100) << " Offset = " << offset;
     return offset;
   }
 };
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 6257e04b010d8c580e69e466759e8e80d344c105..3c40135eca00f4e0bbff9b0f0f7cf2a4c85ec556 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
 
         size_t start_offset = lod_and_offset.second.first;
         size_t end_offset = lod_and_offset.second.second;
-        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
-                 << ", " << end_offset << "]";
+        VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                  << start_offset << ", " << end_offset << "]";  // range
         // Copy data
         PADDLE_ENFORCE_GE(end_offset, start_offset);
         size_t len = end_offset - start_offset;
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index 0784920064a879963cd9725cd9acf4cec7b874ce..cb98bc514083ad113fdebfbac043a9516fd9435a 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -53,7 +53,7 @@ class AucOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 3eb473832577bd348b33ba9b0be9e597b78f26bc..cf245f5038f5f5ad1b623542aa14686eff8aad32 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
   }
 };
 
+class BatchNormOpInferVarType  // pairs input "X" with output "Y"
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // NOTE(review): base presumably copies type/dtype
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
+
 template <typename T>
 class BatchNormKernel<platform::CPUDeviceContext, T>
     : public framework::OpKernel<T> {
@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormGradMaker);
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index aaed335c905c0d80cd519afc5fecb06af73fcfe7..0609027c6940533483173209176f3243ccb36f8f 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif
 
-    VLOG(3) << "Setting descriptors.";
+    VLOG(30) << "Setting descriptors.";
     std::vector<int> dims;
     std::vector<int> strides;
     if (data_layout == DataLayout::kNCHW) {
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 62771d09f112785ca1ba741a0ba239b1f0234633..791f8a4d3be6780c584997113b7ffcfb7ab35667 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
 
   auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
   auto selected_items = ToMap(items, high_level.back());
-  VLOG(3) << "selected_items:";
+  VLOG(30) << "selected_items:";
   for (size_t i = 0; i < selected_items.size(); ++i) {
-    VLOG(3) << "offset:" << i;
+    VLOG(30) << "offset:" << i;
     for (auto &item : selected_items[i]) {
-      VLOG(3) << ItemToString(item);
+      VLOG(30) << ItemToString(item);
     }
   }
 
@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
     }
     result.emplace_back(items);
   }
-  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  VLOG(30) << "SelectTopBeamSizeItems result size " << result.size();
   for (auto &items : result) {
-    VLOG(3) << "item set:";
+    VLOG(30) << "item set:";
     for (auto &item : items) {
-      VLOG(3) << ItemToString(item);
+      VLOG(30) << ItemToString(item);
     }
   }
 
diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu
deleted file mode 100644
index 4c1971538495c6f111e9db18f4014786f6f0dd58..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/bilinear_interp_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-__global__ void KeBilinearInterpFw(
-    const T* in, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-    int channel_id = out_id_w / out_img_size;
-
-    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                          in_img_idy * in_img_w + in_img_idx];
-
-    // bilinear interpolation
-    out[out_id_h * output_w + out_id_w] =
-        h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
-        h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
-                    w1lambda * in_pos[h_id * in_img_w + w_id]);
-  }
-}
-
-template <typename T>
-__global__ void KeBilinearInterpBw(
-    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
-    const size_t input_w, const T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-    int channel_id = out_id_w / out_img_size;
-
-    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                    in_img_idy * in_img_w + in_img_idx];
-    const T* out_pos = &out[out_id_h * output_w + out_id_w];
-    atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w + w_id],
-              h1lambda * w1lambda * out_pos[0]);
-  }
-}
-
-template <typename T>
-class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
-    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
-    auto* input = input_t->data<T>();
-
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-    auto out_dims = output_t->dims();
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
-      auto size_data = sizes.data<int>();
-      out_h = size_data[0];
-      out_w = size_data[1];
-    }
-    auto* output = output_t->mutable_data<T>(
-        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
-
-    int batch_size = input_t->dims()[0];
-    int channels = input_t->dims()[1];
-    int in_h = input_t->dims()[2];
-    int in_w = input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(output, input, input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
-
-      KeBilinearInterpFw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          input, in_h, in_w, batch_size, in_chw, output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
-    }
-  }
-};
-
-template <typename T>
-class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_output = d_output_t->data<T>();
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
-
-    auto& device_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, T> zero;
-    zero(device_ctx, d_input_t, static_cast<T>(0.0));
-
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
-      auto size_data = sizes.data<int>();
-      out_h = size_data[0];
-      out_w = size_data[1];
-    }
-
-    int batch_size = d_input_t->dims()[0];
-    int channels = d_input_t->dims()[1];
-    int in_h = d_input_t->dims()[2];
-    int in_w = d_input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
-
-      KeBilinearInterpBw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(bilinear_interp,
-                        ops::BilinearInterpOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
-                        ops::BilinearInterpGradOpCUDAKernel<float>);
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
deleted file mode 100644
index 70847cb8c1abe2e94bc844ab8117d1f23fea533b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class BilinearInterpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
-    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
-    auto out_dims = output_t->dims();
-    auto* input = input_t->data<T>();
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      auto out_size_data = out_size_t->data<int>();
-      out_h = out_size_data[0];
-      out_w = out_size_data[1];
-    }
-    auto* output = output_t->mutable_data<T>(
-        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
-    int batch_size = input_t->dims()[0];
-    int channels = input_t->dims()[1];
-    int in_h = input_t->dims()[2];
-    int in_w = input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(output, input, input_t->numel() * sizeof(T));
-    } else {
-      for (int k = 0; k < batch_size; ++k) {  // loop for batches
-        for (int i = 0; i < out_h; ++i) {     // loop for images
-          int h = ratio_h * i;
-          int hid = (h < in_h - 1) ? 1 : 0;
-          float h1lambda = ratio_h * i - h;
-          float h2lambda = 1.f - h1lambda;
-
-          for (int j = 0; j < out_w; ++j) {
-            int w = ratio_w * j;
-            int wid = (w < in_w - 1) ? 1 : 0;
-            float w1lambda = ratio_w * j - w;
-            float w2lambda = 1.f - w1lambda;
-            // calculate four position for bilinear interpolation
-            const T* in_pos = &input[k * in_chw + h * in_w + w];
-            T* out_pos = &output[k * out_chw + i * out_w + j];
-
-            for (int c = 0; c < channels; ++c) {  // loop for channels
-              // bilinear interpolation
-              out_pos[0] = static_cast<T>(
-                  h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
-                  h1lambda * (w2lambda * in_pos[hid * in_w] +
-                              w1lambda * in_pos[hid * in_w + wid]));
-              in_pos += in_hw;
-              out_pos += out_hw;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class BilinearInterpGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_output = d_output_t->data<T>();
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
-    auto& device_ctx =
-        ctx.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, T> zero;
-    zero(device_ctx, d_input_t, static_cast<T>(0.0));
-
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      auto out_size_data = out_size_t->data<int>();
-      out_h = out_size_data[0];
-      out_w = out_size_data[1];
-    }
-
-    int batch_size = d_input_t->dims()[0];
-    int channels = d_input_t->dims()[1];
-    int in_h = d_input_t->dims()[2];
-    int in_w = d_input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
-    } else {
-      for (int k = 0; k < batch_size; ++k) {  // loop for batches
-        for (int i = 0; i < out_h; ++i) {     // loop for images
-          int h = ratio_h * i;
-          int hid = (h < in_h - 1) ? 1 : 0;
-          float h1lambda = ratio_h * i - h;
-          float h2lambda = 1 - h1lambda;
-
-          for (int j = 0; j < out_w; ++j) {
-            int w = ratio_w * j;
-            int wid = (w < in_w - 1) ? 1 : 0;
-            float w1lambda = ratio_w * j - w;
-            float w2lambda = 1 - w1lambda;
-            T* in_pos = &d_input[k * in_chw + h * in_w + w];
-            const T* out_pos = &d_output[k * out_chw + i * out_w + j];
-
-            for (int c = 0; c < channels; ++c) {  // loop for channels
-              in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
-              in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
-              in_pos[hid * in_w] +=
-                  static_cast<T>(h1lambda * w2lambda * out_pos[0]);
-              in_pos[hid * in_w + wid] +=
-                  static_cast<T>(h1lambda * w1lambda * out_pos[0]);
-              in_pos += in_hw;
-              out_pos += out_hw;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc
index 7c072cb071a5d1e6a0549cf6d9eff18fd2533edc..defa287bdb913e406aa7e2a383cefc3cca8c4d94 100644
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
       auto lookup_table_save_dir =
           string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
       rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
-      VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
-              << " and dir:" << dir << " to " << epmap[i];
+      VLOG(30) << "checkpoint notify sending lookup table: "
+               << lookup_table_name << " and dir:" << dir << " to " << epmap[i];
     }
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 57817da71adfd80faad29a48b05ba2f326de6c07..093b0a9a1f9ac05cf4d72fc748fac827387e5dbe 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
     if (n == 1) {
-      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+      VLOG(30) << "Warning: concat op have only one input, may waste memory";
     }
 
     auto out_dims = ins[0];
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 76eda51ad414030074b69ee8d4f796c5c32d12f3..3083e622c3066879e107f930a45bcec36d347f80 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -15,15 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "true, the algorithm is deterministic.");
+DEFINE_uint64(conv_workspace_size_limit, 4096,
+              "cuDNN convolution workspace limit in MB unit.");
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether enable exhaustive search for cuDNN convolution or "
+            "not, defalut is False.");
 
 namespace paddle {
 namespace operators {
@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
+static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
+static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
+static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
+
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
+// Number of forward algorithms to request from the cuDNN exhaustive search.
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
     }
+
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
-
+    bool half_float = false;
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      VLOG(5) << "use cudnn_tensor_op_math";
+      half_float = true;
+      VLOG(50) << "use cudnn_tensor_op_math";
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+      VLOG(50) << "NOT use cudnn_tensor_op_math";
     }
 #endif
 
+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
+    if ((!exhaustive_search) && (!half_float)) {
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &algo));
+      VLOG(3) << "cuDNN forward algo " << algo;
+    } else if (exhaustive_search && (!half_float)) {
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
+      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
+        algo_cache =
+            ctx.scope()
+                .FindVar(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      } else {
+        algo_cache =
+            const_cast<framework::Scope&>(ctx.scope())
+                .Var(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      }
+      algo = algo_cache->GetAlgorithm(
+          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+            int returned_algo_count;
+            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
+                fwd_perf_stat;
+            auto cudnn_find_func = [&](void* cudnn_workspace) {
+              CUDNN_ENFORCE(
+                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                      filter_data, cudnn_conv_desc, cudnn_output_desc,
+                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                      fwd_perf_stat.data(), cudnn_workspace,
+                      workspace_size_limit));
+            };
+            workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+
+            VLOG(3) << "Perf result: (algo: stat, time, memory)";
+            for (int i = 0; i < returned_algo_count; ++i) {
+              const auto& stat = fwd_perf_stat[i];
+              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                      << " " << stat.memory;
+            }
+            return fwd_perf_stat[0].algo;
+          });
+      VLOG(3) << "choose algo " << algo;
+    } else {
+      PADDLE_ENFORCE(half_float,
+                     "cuDNN exhaustive search doesn't support half float.");
+    }
+
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int i = 0; i < groups; i++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
         CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
@@ -180,6 +252,7 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
+    if (exhaustive_search && FLAGS_cudnn_deterministic) {
+      PADDLE_THROW(
+          "Cann't set exhaustive_search True and "
+          "FLAGS_cudnn_deterministic True at same time.");
+    }
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
     size_t workspace_size_in_bytes = 0, tmp_size = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
     }
 
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
     auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
-      if (!FLAGS_cudnn_deterministic) {
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
+          data_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        } else {
+          data_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        }
+        data_algo = data_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
+                         kNUM_CUDNN_BWD_DATA_ALGS>
+                  data_perf_stat;
+              auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardDataAlgorithmEx(
+                            handle, cudnn_filter_desc, filter_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
+                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
+                            data_perf_stat.data(), cudnn_workspace,
+                            workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_data_func,
+                                       workspace_size_limit);
+
+              VLOG(3) << "Perf result: (algo: stat, time, memory)";
+              for (int i = 0; i < returned_algo_count; ++i) {
+                const auto& stat = data_perf_stat[i];
+                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                        << " " << stat.memory;
+              }
+              return data_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward data algo " << data_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      } else {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 cudnn_input_desc,
                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &data_algo));
-      } else {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (!FLAGS_cudnn_deterministic) {
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
+          f_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        } else {
+          f_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        }
+        filter_algo = f_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
+                         kNUM_CUDNN_BWD_FILTER_ALGS>
+                  filter_perf_stat;
+              auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                            handle, cudnn_input_desc, input_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_filter_desc,
+                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
+                            &returned_algo_count, filter_perf_stat.data(),
+                            cudnn_workspace, workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_f_func,
+                                       workspace_size_limit);
+              return filter_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      } else {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &filter_algo));
-      } else {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b534321f746d5620005743eb8d45b71259156dd
--- /dev/null
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+// Mutex-guarded cache mapping a convolution configuration to the algorithm
+// produced for it, so that gen_func runs only once per configuration key.
+template <typename TAlgorithm>
+class AlgorithmsCache {
+ public:
+  // Returns the algorithm cached for the given combination of tensor
+  // dimensions, convolution parameters and flags. On a cache miss, gen_func
+  // is invoked (while the internal mutex is held) and its result is stored.
+  //
+  // NOTE: entries are keyed by a combined hash of the arguments only, so two
+  // different configurations whose hashes collide share one cache entry.
+  TAlgorithm GetAlgorithm(
+      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+      const std::vector<int>& strides, const std::vector<int>& paddings,
+      const std::vector<int>& dilations,
+      int algorithmFlags,  // can set for different data type
+      std::function<TAlgorithm()> gen_func);
+
+ private:
+  std::unordered_map<int64_t, TAlgorithm> hash_;  // combined hash -> algorithm
+  std::mutex mutex_;                              // guards hash_
+};
+
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+    const std::vector<int>& strides, const std::vector<int>& paddings,
+    const std::vector<int>& dilations, int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  int64_t seed = 0;
+  // Hash all of the inputs, use to try and look up a previously
+  // discovered algorithm, or fall back to generating a new one.
+  std::hash<int64_t> hashFn;
+  // do hash like boost
+  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
+  for (const auto num : dims1) {
+    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+  }
+
+  for (const auto num : dims2) {
+    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
+  }
+
+  for (const auto num : strides) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 2;
+  }
+
+  for (const auto num : paddings) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 3;
+  }
+
+  for (const auto num : dilations) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 4;
+  }
+
+  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
+          (seed << 6) + (seed >> 2) + 5;
+
+  // A zero seed is never cached; the algorithm is regenerated on every call.
+  if (seed == 0) return gen_func();
+
+  // Single lookup; gen_func only runs when the key is not cached yet.
+  auto iter = hash_.find(seed);
+  if (iter == hash_.end()) {
+    iter = hash_.emplace(seed, gen_func()).first;
+  }
+  return iter->second;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 72cac9bc9fac9d9199e1f45db16e529adef2a676..f2cc6642ee6c45cfd95fa3b5ccc58a4832fb8db4 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -375,8 +375,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 2cd9979bd3426a15af34a49002d5db2fdd9aeec7..4d370746382a4247f51aafa189e86eece941c320 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() {
                "workspace size can increase performance but also requires "
                "better hardware. This size should be chosen carefully.")
       .SetDefault(4096);
+  AddAttr<bool>("exhaustive_search",
+                "(bool, default false) cuDNN has many algorithm to calculation "
+                "convolution, whether enable exhaustive search "
+                "for cuDNN convolution or not, defalut is False.")
+      .SetDefault(false);
   AddComment(R"DOC(
 Convolution Operator.
 
@@ -219,6 +224,15 @@ $$
 )DOC");
 }
 
+class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{
+        {"Input", /*->*/ "Output"}};
+  }
+};
+
 void Conv3DOpMaker::Make() {
   AddInput(
       "Input",
@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() {
                "workspace size can increase performance but also requires "
                "better hardware. This size should be chosen carefully.")
       .SetDefault(4096);
-
+  AddAttr<bool>("exhaustive_search",
+                "(bool, default false) cuDNN has many algorithm to calculation "
+                "convolution, whether enable exhaustive search "
+                "for cuDNN convolution or not, defalut is False.")
+      .SetDefault(false);
   AddComment(R"DOC(
 Convolution3D Operator.
 
@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  ops::ConvOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  ops::ConvOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
 
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 66f19fe7ecfa51b2ce917f0c5fcb6d486f1a7307..a904dd91302c951560dc32ac107d4d73b6024c25 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
 )DOC");
   }
 };
+
+// Variable-type inference for cross_entropy.
+class CrossEntropyOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  // Output "Y" is declared to share its type with input "X".
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override { return {{"X", "Y"}}; }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
 using CPUCtx = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  ops::CrossEntropyOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index d5eec148f9b4f76866ec9fca98a596b9bc2860ef..e5c3f0eeb385e1a15fdbb12a989603996420efe3 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -22,6 +22,7 @@ iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+detection_library(density_prior_box_op SRCS density_prior_box_op.cc)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
 anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99df15c3226b4305a28a3912398d6d1c766daa73
--- /dev/null
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -0,0 +1,175 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/density_prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DensityPriorBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks both NCHW inputs; Boxes/Variances become [H, W, num_priors, 4].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of DensityPriorBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Image"),
+                   "Input(Image) of DensityPriorBoxOp should not be null.");
+
+    auto image_dims = ctx->GetInputDim("Image");
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    // The feature map must be strictly smaller than the image in H and W.
+    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
+                      "The height of input must smaller than image.");
+    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
+                      "The width of input must smaller than image.");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+    auto fixed_sizes = ctx->Attrs().Get<std::vector<float>>("fixed_sizes");
+    auto fixed_ratios = ctx->Attrs().Get<std::vector<float>>("fixed_ratios");
+    auto densities = ctx->Attrs().Get<std::vector<int>>("densities");
+    PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(),
+                      "The number of fixed_sizes and densities must be equal.");
+    // Every (fixed_size, density) pair contributes density^2 boxes per fixed
+    // ratio. Integer arithmetic keeps the count exact; no floating-point pow.
+    int64_t density_sq_sum = 0;
+    for (int density : densities) {
+      density_sq_sum += static_cast<int64_t>(density) * density;
+    }
+    const int64_t num_priors =
+        static_cast<int64_t>(fixed_ratios.size()) * density_sq_sum;
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_priors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+
+ protected:
+  // Kernel data type follows Input; the place is always CPUPlace.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "(Tensor, default Tensor<float>), "
+        "the input feature data of DensityPriorBoxOp, the layout is NCHW.");
+    AddInput("Image",
+             "(Tensor, default Tensor<float>), "
+             "the input image data of DensityPriorBoxOp, the layout is NCHW.");
+    AddOutput("Boxes",
+              "(Tensor, default Tensor<float>), the output prior boxes of "
+              "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances of "
+              "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddAttr<std::vector<float>>("variances",
+                                "(vector<float>) List of variances to be "
+                                "encoded in density prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
+        .SetDefault(true);
+    // If either step is 0.0 the kernel derives both from image / feature size.
+    AddAttr<float>(
+        "step_w",
+        "Density prior boxes step across width, 0.0 for auto calculation.")
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
+        });
+    AddAttr<float>(
+        "step_h",
+        "Density prior boxes step across height, 0.0 for auto calculation.")
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Density prior boxes center offset.")
+        .SetDefault(0.5);
+    AddAttr<std::vector<float>>("fixed_sizes",
+                                "(vector<float>) List of fixed sizes "
+                                "of generated density prior boxes.")
+        .SetDefault(std::vector<float>{})
+        .AddCustomChecker([](const std::vector<float>& fixed_sizes) {
+          for (size_t i = 0; i < fixed_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(fixed_sizes[i], 0.0,
+                              "fixed_sizes[%d] should be larger than 0.", i);
+          }
+        });
+
+    AddAttr<std::vector<float>>("fixed_ratios",
+                                "(vector<float>) List of fixed ratios "
+                                "of generated density prior boxes.")
+        .SetDefault(std::vector<float>{})
+        .AddCustomChecker([](const std::vector<float>& fixed_ratios) {
+          for (size_t i = 0; i < fixed_ratios.size(); ++i) {
+            PADDLE_ENFORCE_GT(fixed_ratios[i], 0.0,
+                              "fixed_ratios[%d] should be larger than 0.", i);
+          }
+        });
+    // densities[i] pairs with fixed_sizes[i]; InferShape enforces equal sizes.
+    AddAttr<std::vector<int>>("densities",
+                              "(vector<int>) List of densities "
+                              "of generated density prior boxes.")
+        .SetDefault(std::vector<int>{})
+        .AddCustomChecker([](const std::vector<int>& densities) {
+          for (size_t i = 0; i < densities.size(); ++i) {
+            PADDLE_ENFORCE_GT(densities[i], 0,
+                              "densities[%d] should be larger than 0.", i);
+          }
+        });
+    AddComment(R"DOC(
+        Density Prior box operator
+        Each position of the input produce N density prior boxes, N is determined by
+        the count of fixed_ratios, densities, the calculation of N is as follows:
+        for density in densities:
+        N += size(fixed_ratios)*density^2
+        )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(density_prior_box, ops::DensityPriorBoxOp,
+                  ops::DensityPriorBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// Only CPU kernels (float and double) are registered here.
+REGISTER_OP_CPU_KERNEL(density_prior_box, ops::DensityPriorBoxOpKernel<float>,
+                       ops::DensityPriorBoxOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a52077e9cf90b278549a077af161bd4e282d972
--- /dev/null
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/operators/detection/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+// CPU kernel: writes normalized [xmin, ymin, xmax, ymax] density prior boxes
+// for every feature-map cell and replicates `variances` for each box.
+template <typename T>
+class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto clip = ctx.Attr<bool>("clip");
+    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
+    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
+    auto densities = ctx.Attr<std::vector<int>>("densities");
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+    // If either step attribute is 0, both steps become image / feature size.
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+    // Boxes per position: density^2 for each (fixed_size, density) pair, times
+    // the number of fixed ratios. Computed with integer arithmetic so the
+    // count stays exact.
+    const int num_ratios = static_cast<int>(fixed_ratios.size());
+    int num_priors = 0;
+    for (int density : densities) {
+      num_priors += num_ratios * density * density;
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
+    // Cell size used to spread the density grid (truncated to an integer).
+    int step_average = static_cast<int>((step_width + step_height) * 0.5);
+
+    for (int h = 0; h < feature_height; ++h) {
+      for (int w = 0; w < feature_width; ++w) {
+        T center_x = (w + offset) * step_width;
+        T center_y = (h + offset) * step_height;
+        int idx = 0;
+        // Generate density prior boxes with fixed sizes.
+        for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+          auto fixed_size = fixed_sizes[s];
+          int density = densities[s];
+          // Generate density prior boxes with fixed ratios.
+          if (fixed_ratios.size() > 0) {
+            for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+              float ar = fixed_ratios[r];
+              int shift = step_average / density;
+              float box_width_ratio = fixed_size * sqrt(ar);
+              float box_height_ratio = fixed_size / sqrt(ar);
+              for (int di = 0; di < density; ++di) {
+                for (int dj = 0; dj < density; ++dj) {
+                  float center_x_temp =
+                      center_x - step_average / 2. + shift / 2. + dj * shift;
+                  float center_y_temp =
+                      center_y - step_average / 2. + shift / 2. + di * shift;
+                  e_boxes(h, w, idx, 0) =
+                      (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                          ? (center_x_temp - box_width_ratio / 2.) / img_width
+                          : 0;
+                  e_boxes(h, w, idx, 1) =
+                      (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                          ? (center_y_temp - box_height_ratio / 2.) / img_height
+                          : 0;
+                  e_boxes(h, w, idx, 2) =
+                      (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                          ? (center_x_temp + box_width_ratio / 2.) / img_width
+                          : 1;
+                  e_boxes(h, w, idx, 3) =
+                      (center_y_temp + box_height_ratio / 2.) / img_height <= 1
+                          ? (center_y_temp + box_height_ratio / 2.) / img_height
+                          : 1;
+                  idx++;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    if (clip) {  // NOTE: coordinates above are already clamped to [0, 1]
+      platform::Transform<platform::CPUDeviceContext> trans;
+      ClipFunctor<T> clip_func;
+      trans(ctx.template device_context<platform::CPUDeviceContext>(),
+            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
+            boxes->data<T>(), clip_func);
+    }
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+    // Tile the variance row over all boxes, then restore the original dims.
+    int box_num = feature_height * feature_width * num_priors;
+    auto var_dim = vars->dims();
+    vars->Resize({box_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc
index 862167f02084cfe81db1c0936bbfb0415fa85721..47a06dd0f378f6cc4f79aee52052717188d72420 100644
--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
 void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
 
 void AsyncBRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  VLOG(30) << "AsyncGRPCServer is wait server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(30) << "AsyncGRPCServer WaitSeverReady";
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index be5c20ad2e4b53e0ff98561b92543b03298381d9..c28f86146d3040c6a26cabfb795eff67375d4b76 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() {
   std::unique_lock<std::mutex> lk(completed_mutex_);
   if (!completed_) {
     for (auto& it : channels_) {
-      VLOG(3) << "send complete message to " << it.first;
+      VLOG(30) << "send complete message to " << it.first;
       this->AsyncSendComplete(it.first);
     }
     PADDLE_ENFORCE(this->Wait(), "internal grpc error");
@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     ::grpc::ByteBuffer req;
     SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
 
-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
     s->response_call_back_ = nullptr;
@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
     ::grpc::ByteBuffer buf;
     RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
 
-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
     s->response_call_back_ = ProcGetResponse;
@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
     ::grpc::ByteBuffer req;
     SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
 
-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
     s->response_call_back_ = ProcGetResponse;
@@ -328,14 +328,14 @@ void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
-  VLOG(3) << "GRPCClient Proceed begin";
+  VLOG(30) << "GRPCClient Proceed begin";
   while (!stopped_ && cq_.Next(&tag, &ok)) {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
 
     if (c->status_.ok()) {
-      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
+      VLOG(30) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
     } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
       // FIXME(gongwb): parse error_details?
@@ -370,7 +370,7 @@ void GRPCClient::Proceed() {
       sync_cond_.notify_all();
     }
   }
-  VLOG(3) << "GRPCClient Proceed end";
+  VLOG(30) << "GRPCClient Proceed end";
 }
 
 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index eb9e36029c01fb38ca4438578190dd7895182ea1..ffd2b1707bea6c9379dc09c629fa4c920dac8ed0 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -98,7 +98,7 @@ class RequestSend final : public RequestBase {
 
   void Process() override {
     std::string varname = GetReqName();
-    VLOG(4) << "RequestSend var_name:" << varname;
+    VLOG(40) << "RequestSend var_name:" << varname;
 
     auto scope = request_->GetMutableLocalScope();
     auto invar = request_->GetVar();
@@ -135,7 +135,7 @@ class RequestGet final : public RequestBase {
     // proc request.
     std::string varname = request_.varname();
     int trainer_id = request_.trainer_id();
-    VLOG(4) << "RequestGet " << varname;
+    VLOG(40) << "RequestGet " << varname;
 
     auto scope = request_handler_->scope();
     auto invar = scope->FindVar(varname);
@@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase {
     std::string in_var_name = request_->Varname();
     std::string out_var_name = request_->OutVarname();
     int trainer_id = request_->GetTrainerId();
-    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
-            << " out_var_name: " << out_var_name;
+    VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name
+             << " out_var_name: " << out_var_name;
 
     auto scope = request_->GetMutableLocalScope();
     auto invar = scope->FindVar(in_var_name);
@@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase {
     std::string checkpoint_dir = request_->OutVarname();
     int trainer_id = request_->GetTrainerId();
 
-    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
-            << ", dir: " << checkpoint_dir;
+    VLOG(40) << "RequestCheckpointNotify notify: " << checkpoint_notify
+             << ", dir: " << checkpoint_dir;
 
     request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
                              trainer_id, checkpoint_dir);
@@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase {
 };
 
 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(40) << "AsyncGRPCServer is wait server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(40) << "AsyncGRPCServer WaitSeverReady";
 }
 
 void AsyncGRPCServer::StartServer() {
@@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() {
     reqs.reserve(kRequestBufSize);
 
     for (int i = 0; i < kRequestBufSize; i++) {
-      VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
+      VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+               << " I: " << i;
       TryToRegisterNewOne(rpc_name, i);
     }
 
     for (int i = 0; i < threadnum; i++) {
       rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
           &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(4) << t.first << " creates threads!";
+      VLOG(40) << t.first << " creates threads!";
     }
   }
 
@@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() {
     auto& threads = t.second;
     for (size_t i = 0; i < threads.size(); ++i) {
       threads[i]->join();
-      VLOG(4) << t.first << " threads ends!";
+      VLOG(40) << t.first << " threads ends!";
     }
   }
 }
@@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() {
 void AsyncGRPCServer::ShutdownQueue() {
   for (auto& t : rpc_cq_) {
     t.second->Shutdown();
-    VLOG(4) << t.first << " queue shutdown!";
+    VLOG(40) << t.first << " queue shutdown!";
   }
 }
 
@@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() {
   is_shut_down_ = true;
   ShutdownQueue();
 
-  VLOG(4) << "server_ shutdown!";
+  VLOG(40) << "server_ shutdown!";
   server_->Shutdown();
 }
 
@@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                           int req_id) {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
-    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(40) << "shutdown, do not TryToRegisterNewSendOne";
     return;
   }
 
-  VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
-          << " REQ ID: " << req_id;
+  VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+           << " REQ ID: " << req_id;
 
   auto& reqs = rpc_reqs_[rpc_name];
   auto& handler = rpc_call_map_[rpc_name];
@@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
 
   reqs[req_id] = b;
 
-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(40) << "Create RequestSend status:" << b->Status();
 }
 
 void AsyncGRPCServer::HandleRequest(
@@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest(
   bool ok = false;
 
   while (true) {
-    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(40) << "HandleRequest " << rpc_name << " wait next";
     if (!cq->Next(&tag, &ok)) {
-      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!";
       break;
     }
 
     int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
-            << " get next";
+    VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+             << " get next";
 
     auto& reqs = rpc_reqs_[rpc_name];
     RequestBase* base = nullptr;
@@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest(
       base = reqs[req_id];
     }
 
-    VLOG(3) << base->Status2String(rpc_name);
+    VLOG(30) << base->Status2String(rpc_name);
 
     // reference:
     // https://github.com/tensorflow/tensorflow/issues/5596
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 3c1db147098055e9974c9dc607266cdaf2e43dae..3bcc59a47ba5f52da1374f220828a0f392e13d27 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -75,7 +75,7 @@ class VarHandle {
       wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
       ret = status_;
     }
-    VLOG(7) << "VarHandle wait:" << ret;
+    VLOG(70) << "VarHandle wait:" << ret;
     return ret != kErrorState;
   }
 
@@ -84,7 +84,7 @@ class VarHandle {
       std::unique_lock<std::mutex> lk(sync_mutex_);
       status_ = ok ? kFinishState : kErrorState;
     }
-    VLOG(7) << "VarHandle finish:" << ok;
+    VLOG(70) << "VarHandle finish:" << ok;
     wait_cond_.notify_all();
   }
 
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 025528fe70b8f4d353ab92f29b1bd71c77cf7850..dae56cc8436c2241bfc8ae37ba3cad4069a054bf 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Variable** outvar,
                                 const int trainer_id,
                                 const std::string& out_var_name) {
-  VLOG(4) << "RequestSendHandler:" << varname;
+  VLOG(40) << "RequestSendHandler:" << varname;
 
   // Sync
   if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
+    VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE";
     rpc_server_->IncreaseBatchBarrier(kRequestSend);
   } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
+    VLOG(30) << "sync: recv complete message";
     rpc_server_->Complete();
   } else {
     // Async
     if (!sync_mode_) {
-      VLOG(3) << "async process var: " << varname;
+      VLOG(30) << "async process var: " << varname;
       try {
         executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                       scope);
@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
       return true;
     } else {  // sync
       rpc_server_->WaitCond(kRequestSend);
-      VLOG(3) << "sync: processing received var: " << varname;
+      VLOG(30) << "sync: processing received var: " << varname;
 
       if (invar == nullptr) {
         LOG(FATAL) << "sync: Can not find server side var: " << varname;
@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Variable** outvar,
                                const int trainer_id,
                                const std::string& out_var_name) {
-  VLOG(4) << "RequestGetHandler:" << varname;
+  VLOG(40) << "RequestGetHandler:" << varname;
   if (sync_mode_) {
     if (varname == FETCH_BARRIER_MESSAGE) {
-      VLOG(3) << "sync: recv fetch barrier message";
+      VLOG(30) << "sync: recv fetch barrier message";
       rpc_server_->IncreaseBatchBarrier(kRequestGet);
     } else {
       rpc_server_->WaitCond(kRequestGet);
@@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
         // NOTE: the format is determined by distributed_transpiler.py
         std::string param_bak_name =
             string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
-        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        VLOG(30) << "getting " << param_bak_name << " trainer_id "
+                 << trainer_id;
         auto var = scope_->FindVar(varname);
         auto t_orig = var->Get<framework::LoDTensor>();
         auto param_bak = scope_->Var(param_bak_name);
         auto t = param_bak->GetMutable<framework::LoDTensor>();
         t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
-        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        VLOG(30) << "copying " << varname << " to " << param_bak_name;
         framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
       }
       *outvar = scope_->FindVar(varname);
@@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Variable** outvar,
                                     const int trainer_id,
                                     const std::string& out_var_name) {
-  VLOG(4) << "RequestPrefetchHandler " << varname;
+  VLOG(40) << "RequestPrefetchHandler " << varname;
 
   auto var_desc = program_->Block(0).FindVar(out_var_name);
   InitializeVariable(*outvar, var_desc->GetType());
@@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
   auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
   lt_var->clear();
   lt_var->append(out_var_name);
-  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
-          << out_var_name;
+  VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: "
+           << out_var_name;
   executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
   return true;
 }
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 3e30ed4ac86bd2cb3f7c4301163e54a947c3d5b4..4055091104f2f96070d0c4e806c6908da691d732 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
   port_file.open(file_path);
   port_file << selected_port_;
   port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
+  VLOG(40) << "selected port written to " << file_path;
 }
 
 void RPCServer::WaitBarrier(const std::string& rpc_name) {
@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
             exit_flag_.load());
   });
 
-  VLOG(3) << "batch_barrier_: " << rpc_name << " "
-          << barrier_counter_[rpc_name];
+  VLOG(30) << "batch_barrier_: " << rpc_name << " "
+           << barrier_counter_[rpc_name];
 }
 
 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
   int b = 0;
   std::unique_lock<std::mutex> lock(mutex_);
   b = ++barrier_counter_[rpc_name];
@@ -71,7 +71,7 @@ void RPCServer::Complete() {
     client_num_--;
     need_reset_all_vars_ = true;
 
-    VLOG(4) << "decrease client_num to: " << client_num_;
+    VLOG(40) << "decrease client_num to: " << client_num_;
     if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
       barrier_counter_[kRequestGet]--;
     }
@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
 }
 
 void RPCServer::ResetBarrierCounter() {
-  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  VLOG(30) << "RPCServer ResetBarrierCounter ";
   std::unique_lock<std::mutex> lock(mutex_);
   for (auto& t : barrier_counter_) {
     t.second = 0;
@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
 
   static int cond = -1;
   rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
-          << ", cond:" << rpc_cond_map_[rpc_name];
+  VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+           << ", cond:" << rpc_cond_map_[rpc_name];
 }
 
 void RPCServer::SetCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  VLOG(30) << "RPCServer SetCond " << rpc_name;
   {
     std::unique_lock<std::mutex> lock(mutex_);
     cur_cond_ = rpc_cond_map_[rpc_name];
@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }
 
 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(4) << "RPCServer WaitCond " << rpc_name;
+  VLOG(40) << "RPCServer WaitCond " << rpc_name;
   int cond = 0;
   {
     std::unique_lock<std::mutex> lock(mutex_);
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index b2f73b67dc9bf944892187abd2e5709e54449d7d..d1572ce01aa17273988955c27bdea5b2f40c27ea 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
         size_to_write = length - total_written;
       }
       // This log is useful to see how long a internal block size is of rpc.
-      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
+      VLOG(70) << "copy " << size_to_write << " data to CUDAPlace";
       memory::Copy(boost::get<platform::CUDAPlace>(place),
                    reinterpret_cast<void*>(p), cpu, data, size_to_write,
                    gpu_dev_ctx.stream());
@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
     // TODO(gongwb): can we avoid copy?
     platform::CPUPlace cpu;
     // This log is useful to see how long a internal block size is of rpc.
-    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
+    VLOG(70) << "copy " << size_to_write << " data to CPUPlace";
     memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
 
     p += size_to_write;
@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
 #endif
   }
 
-  VLOG(7) << "ProcSerializedField:" << meta_.varname()
-          << ", type:" << meta_.type() << std::endl;
+  VLOG(70) << "ProcSerializedField:" << meta_.varname()
+           << ", type:" << meta_.type() << std::endl;
   framework::DDim dims = GetDims(meta_.dims());
   if (meta_.type() == sendrecv::LOD_TENSOR) {
     PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index c60cb1f92e99329d52f6ed39dccde406a5f83563..9edbdbefe76600dc4bf937d95e70d11450206cd4 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -28,9 +28,9 @@ struct AddFunctor {
 };
 
 template <typename DeviceContext, typename T>
-void default_elementwise_add(const framework::ExecutionContext& ctx,
-                             const framework::Tensor* x,
-                             const framework::Tensor* y, framework::Tensor* z) {
+void default_elementwise_add(const framework::ExecutionContext &ctx,
+                             const framework::Tensor *x,
+                             const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
   ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
                                                         AddFunctor<T>(), z);
@@ -40,9 +40,9 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_floating_point<T>::value &&
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
-                framework::Tensor* z) {
+elementwise_add(const framework::ExecutionContext &ctx,
+                const framework::Tensor *x, const framework::Tensor *y,
+                framework::Tensor *z) {
   auto eigen_x = framework::EigenVector<T>::Flatten(*x);
   auto eigen_y = framework::EigenVector<T>::Flatten(*y);
   auto eigen_z = framework::EigenVector<T>::Flatten(*z);
@@ -55,21 +55,20 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     !std::is_floating_point<T>::value ||
     !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
-                framework::Tensor* z) {
+elementwise_add(const framework::ExecutionContext &ctx,
+                const framework::Tensor *x, const framework::Tensor *y,
+                framework::Tensor *z) {
   default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
 }
 
 template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    auto *y = ctx.Input<framework::LoDTensor>("Y");
+    auto *z = ctx.Output<framework::LoDTensor>("Out");
 
-    const auto x = ctx.Input<Tensor>("X");
-    const auto y = ctx.Input<Tensor>("Y");
-    auto z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
 
     auto dims_equal = x->dims() == y->dims();
@@ -87,13 +86,13 @@ struct IdentityGrad {
 };
 
 template <typename DeviceContext, typename T>
-void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
-                                  const framework::Tensor* x,
-                                  const framework::Tensor* y,
-                                  const framework::Tensor* out,
-                                  const framework::Tensor* dout,
-                                  framework::Tensor* dx,
-                                  framework::Tensor* dy) {
+void default_elementwise_add_grad(const framework::ExecutionContext &ctx,
+                                  const framework::Tensor *x,
+                                  const framework::Tensor *y,
+                                  const framework::Tensor *out,
+                                  const framework::Tensor *dout,
+                                  framework::Tensor *dx,
+                                  framework::Tensor *dy) {
   int axis = ctx.Attr<int>("axis");
 
   ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
@@ -106,11 +105,11 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_floating_point<T>::value &&
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_grad(const framework::ExecutionContext& ctx,
-                     const framework::Tensor* x, const framework::Tensor* y,
-                     const framework::Tensor* out,
-                     const framework::Tensor* dout, framework::Tensor* dx,
-                     framework::Tensor* dy) {
+elementwise_add_grad(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     const framework::Tensor *out,
+                     const framework::Tensor *dout, framework::Tensor *dx,
+                     framework::Tensor *dy) {
   auto blas = math::GetBlas<DeviceContext, T>(ctx);
 
   if (dx) {
@@ -128,27 +127,27 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     !std::is_floating_point<T>::value ||
     !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_grad(const framework::ExecutionContext& ctx,
-                     const framework::Tensor* x, const framework::Tensor* y,
-                     const framework::Tensor* out,
-                     const framework::Tensor* dout, framework::Tensor* dx,
-                     framework::Tensor* dy) {
+elementwise_add_grad(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     const framework::Tensor *out,
+                     const framework::Tensor *dout, framework::Tensor *dx,
+                     framework::Tensor *dy) {
   default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
 }
 
 template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
 
     using Tensor = framework::Tensor;
 
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     // skip out, x, y
-    auto* out = dout;
+    auto *out = dout;
     auto *x = dout, *y = dout;
 
     if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h
index 41a7950bf0c598507c0fda48c6a43f2fd38c41d2..cdb1264d298ef48d6b3da39d63ff1d09e1561aa4 100644
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
@@ -28,11 +28,10 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
     ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h
index bfb5c931958b4ca890ea720af42dad91d5625abb..367489dd563f7d8bdf430517cadf49d4ef2a0105 100644
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
@@ -29,11 +29,10 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
     ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h
index db035ffb52e619b337c8190af4ed0e155aaac48d..1bd0a6279766c8eba92d1e3a76191c59410286b2 100644
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
@@ -28,11 +28,10 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
     ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
index b870d08a1a28fd3e678aeb7211f7e3ec8b2c4c65..29e4ab7db1377b6aa80e94a26ab3cb8669f9154a 100644
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -60,11 +60,10 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == y->numel()) {
       elementwise_mul<DeviceContext, T>(ctx, x, y, z);
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 68c6e315cc3b5fa932f8946f6d4f838f4d3fc5a5..f01f67692e1e5dd040971cb0dd1dd793648da97a 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -29,7 +31,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   using Tensor = framework::Tensor;
-  void InferShape(framework::InferShapeContext* ctx) const override {
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of elementwise op should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -37,6 +40,17 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of elementwise op should not be null.");
 
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("X").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s [%s]",
+        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Y").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s [%s]",
+        ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front());
+
     auto x_dim = ctx->GetInputDim("X");
     auto y_dim = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
@@ -47,9 +61,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   }
 
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
@@ -62,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   }
 };
 
-class ElementwiseOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto& x = block->FindRecursiveOrCreateVar(x_name);
-    auto& out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ElementwiseOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
   }
 };
 
@@ -131,6 +140,7 @@ But the output only shares the LoD information with the input $X$.
 
  protected:
   virtual std::string GetName() const = 0;
+
   virtual std::string GetEquation() const = 0;
 };
 
@@ -139,7 +149,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   using Tensor = framework::Tensor;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
@@ -165,7 +175,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
   }
 
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
     auto input_data_type = framework::ToDataType(
         ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
 
@@ -187,7 +197,7 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
   using operators::ElementwiseOpGrad::GetExpectedKernelType;
   using Tensor = framework::Tensor;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
 
@@ -209,11 +219,11 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
 template <typename T>
 class ElemwiseGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dx =
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *dx =
         context.Output<framework::LoDTensor>(framework::GradVarName("X"));
     if (dx != nullptr) {
-      auto& dout =
+      auto &dout =
           *context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
       dx->set_lod(dout.lod());
     }
@@ -234,7 +244,7 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
                                                                              \
    protected:                                                                \
     std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
-      auto* op = new paddle::framework::OpDesc();                            \
+      auto *op = new paddle::framework::OpDesc();                            \
       op->SetType(#kernel_type "_grad");                                     \
       op->SetInput("Y", Input("Y"));                                         \
       op->SetInput(::paddle::framework::GradVarName("Out"),                  \
diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
index 3385df0897700d37d60d8804a01db777ebc02a7e..7204c43464e0b81126148b86f64a36b0e299368b 100644
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -28,11 +28,10 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
     ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
deleted file mode 100644
index 3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class ExtractRowsOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ExtractRowsOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ExtractRowsOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0],
-                      framework::proto::VarType::SELECTED_ROWS,
-                      "The type of input(X) must be SelectedRows.");
-    auto in_dims = ctx->GetInputDim("X");
-
-    ctx->SetOutputDim(
-        "Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1}));
-  }
-};
-
-class ExtractRowsOp : public framework::OperatorBase {
- public:
-  ExtractRowsOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
-    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-
-    auto &in_rows = in.rows();
-    auto out_dim = framework::make_ddim(
-        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
-    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
-
-    if (paddle::platform::is_gpu_place(in.place())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto *dev_ctx = pool.Get(in.place());
-      auto src_ptr = in_rows.Data(in.place());
-      auto stream =
-          reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx)
-              .stream();
-      memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr,
-                   boost::get<platform::CUDAPlace>(in.place()), src_ptr,
-                   in_rows.size() * sizeof(int64_t), stream);
-#else
-      PADDLE_THROW("Not compiled with CUDA.");
-#endif
-    } else {
-      memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(),
-                   in_rows.data(), in_rows.size() * sizeof(int64_t));
-    }
-  }
-};
-
-class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(SelectedRows). The input tensor of extract_rows operator,"
-             " and its type is SelectedRows.");
-    AddOutput("Out", "(Tensor). The the rows of input(X).");
-
-    AddComment(R"DOC(
-    ExtractRows Operator.
-
-The function of extract_rows_op is extracting the rows from the input(X)
-whose type is SelectedRows.
-
-    )DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker,
-                  ops::ExtractRowsOpInferShape);
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index dc7ef664958238ddbd48745bd59cc7db28e49f5b..5da0a536d96e5184d51638bc6b374d2263b5e9eb 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase {
 
     auto col = Attr<int>("col");
 
-    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
-            << out_name;
+    VLOG(30) << "Feed Var " << feed_var_name << "'s " << col
+             << " column to var " << out_name;
 
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 8754856e140ed074782e6fccb8991571a12babab..88a5e59ce7d6c0d14e480922bd328d632c9178e5 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
 
     for (auto& ep : eps) {
-      VLOG(3) << "fetch barrier, ep: " << ep;
+      VLOG(30) << "fetch barrier, ep: " << ep;
       rpc_client->AsyncSendFetchBarrier(ep);
     }
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index c197b45e8196a47def6465128e8ca39d8daefed6..c9e759ebff63948046e67def7fb94e0241029581 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase {
     TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
     dst_item.set_lod(src_item.lod());
 
-    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
+    VLOG(30) << "Fetch variable " << fetch_var_name << " to " << out_name;
   }
 };
 
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index d74d4db92528d69492ab7b90a98d3775dadc35d1..e4df59c5d51c390cf593add0c5562665c91f33f6 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
                const Tensor& index, Tensor* output) {
   // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index.dims().size() == 1);
+  PADDLE_ENFORCE(index.dims().size() == 1 ||
+                 (index.dims().size() == 2 && index.dims()[1] == 1));
+
   int index_size = index.dims()[0];
 
   auto src_dims = src.dims();
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index d72e07d76c97e9e455e54980207d7c02842cc04b..dc08ee5efacde5e232d751b13aaf11f51237634a 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
                const Tensor& index, Tensor* output) {
   PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index.dims().size() == 1);
+  PADDLE_ENFORCE(index.dims().size() == 1 ||
+                 (index.dims().size() == 2 && index.dims()[1] == 1));
   int64_t index_size = index.dims()[0];
 
   auto src_dims = src.dims();
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index f84ff206fffddef1030b7ed439e887bdfef342a6..95aa9b573c795159079bdb5401b34d7a61252115 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
                    "Output(Out) of GatherOp should not be null.");
 
     auto index_dims = ctx->GetInputDim("Index");
-    PADDLE_ENFORCE(index_dims.size() == 1);
+    PADDLE_ENFORCE(index_dims.size() == 1 ||
+                   (index_dims.size() == 2 && index_dims[1] == 1));
     int batch_size = ctx->GetInputDim("Index")[0];
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
@@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
   }
 
  protected:
@@ -75,7 +77,7 @@ Gather Operator.
 
 $Out = X[Index]$
 
-Out is obtained by gathering entries of the outer-most dimension 
+Out is obtained by gathering entries of the outer-most dimension
 of X indexed by Index and concatenate them together.
 
 Example:
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index ef574ccdf48dcf6074a777bcb7667b114415674c..56ea165ff84291babc0e9ee56ada669cbbbe79fe 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
 
     for (auto& ep : endpoint_list) {
-      VLOG(3) << "sending nccl id to " << ep;
+      VLOG(30) << "sending nccl id to " << ep;
       client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client->Wait();
@@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
       client->AsyncSendBatchBarrier(ep);
     }
     client->Wait();
-    VLOG(3) << "sending completed...";
+    VLOG(30) << "sending completed...";
   }
 
   void GetIdByServer(framework::Scope* scope,
@@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
         std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
 
     rpc_service->SetCond(distributed::kRequestSend);
-    VLOG(3) << "start getting nccl id from trainer 0...";
+    VLOG(30) << "start getting nccl id from trainer 0...";
     rpc_service->WaitBarrier(distributed::kRequestSend);
-    VLOG(3) << "got nccl id and stop server...";
+    VLOG(30) << "got nccl id and stop server...";
     rpc_service->ShutDown();
-    VLOG(3) << "rpc server stopped";
+    VLOG(30) << "rpc server stopped";
     server_thread.join();
   }
 };
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index 0d5874fc0cc4b90bec141690b88f28a27443bd60..4e91a3dcd272c8d368cb8c43e7e1fb4c98265db4 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -63,12 +63,19 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
   Tensor ones;
   ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
   auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
+  Tensor half_xmax, half_ymax;
+  half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto half_xmax_t =
+      EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
+  half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto half_ymax_t =
+      EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
 
   // scale grid to [0, h-1/w-1]
   auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
   auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-  grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
-  grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);
+  grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
+  grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
 
   // calculate coords of 4 corner points
   x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/interpolate_op.cc
similarity index 52%
rename from paddle/fluid/operators/bilinear_interp_op.cc
rename to paddle/fluid/operators/interpolate_op.cc
index 2dc3399da183fbcf7664066f6f7ce12db3dc6d5e..8f979e05d31e5a85bc86784943f4588ab650f668 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
@@ -9,7 +9,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/bilinear_interp_op.h"
+#include "paddle/fluid/operators/interpolate_op.h"
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -18,27 +19,34 @@ namespace operators {
 
 using framework::Tensor;
 
-class BilinearInterpOp : public framework::OperatorWithKernel {
+class InterpolateOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of BilinearInterOp should not be null.");
+                   "Input(X) of InterpolateOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of BilinearInterOp should not be null.");
+                   "Output(Out) of InterpolateOp should not be null.");
+
+    auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+    PADDLE_ENFORCE(
+        "bilinear" == interp_method || "nearest" == interp_method,
+        "Interpolation method can only be \"bilinear\" or \"nearest\".");
 
     auto dim_x = ctx->GetInputDim("X");  // NCHW format
     int out_h = ctx->Attrs().Get<int>("out_h");
     int out_w = ctx->Attrs().Get<int>("out_w");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
 
-    if (ctx->HasInput("OutSize")) {
+    if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
       auto out_size_dim = ctx->GetInputDim("OutSize");
       PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
                         "OutSize's dimension size must be 1");
       PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+      ctx->ShareLoD("X", "Out");
+      return;
     }
     std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
     ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
@@ -52,35 +60,53 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
   }
 };
 
-class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
+class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of bilinear interpolation, "
-             "This is a 4-D tensor with shape of (N x C x h x w)");
+             "The input tensor of interpolate operator, "
+             "This is a 4-D tensor with shape of [N, C, H, W].");
     AddInput("OutSize",
-             "This is a 1-D tensor with two number. "
+             "This is a 1-D tensor with two numbers to specify output size. "
              "The first number is height and the second number is width.")
         .AsDispensable();
-    AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
+    AddOutput("Out",
+              "The output tensor of interpolate operator, "
+              "This is a 4-D tensor with shape of [N, C, H, W].");
 
-    AddAttr<int>("out_h", "output height of bilinear interpolation op.");
-    AddAttr<int>("out_w", "output width of bilinear interpolation op.");
+    AddAttr<int>("out_h", "output height of interpolate op.");
+    AddAttr<int>("out_w", "output width of interpolate op.");
+    AddAttr<std::string>(
+        "interp_method",
+        "(string), interpolation method, can be \"bilinear\" for "
+        "bilinear interpolation and \"nearest\" for nearest "
+        "neighbor interpolation.");
     AddComment(R"DOC(
+          This operator samples input X to given output shape by using specified
+          interpolation method, the interpolation methods can be "nearest"
+          for nearest neighbor interpolation and "bilinear" for bilinear
+          interpolation.
+
+          Nearest neighbor interpolation is to perform nearest neighbor interpolation
+          in both the 3rd dimension (in height direction) and the 4th dimension (in width
+          direction) on input tensor.
+            
           Bilinear interpolation is an extension of linear interpolation for 
           interpolating functions of two variables (e.g. H-direction and 
-          W-direction in this op) on a rectilinear 2D grid. 
-          
-          The key idea is to perform linear interpolation first in one 
-          direction, and then again in the other direction.
-            
-          For details, please refer to Wikipedia: 
+          W-direction in this op) on a rectilinear 2D grid. The key idea is 
+          to perform linear interpolation first in one direction, and then 
+          again in the other direction.
+
+          For details of nearest neighbor interpolation, please refer to Wikipedia: 
+          https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
+
+          For details of bilinear interpolation, please refer to Wikipedia: 
           https://en.wikipedia.org/wiki/Bilinear_interpolation
          )DOC");
   }
 };
 
-class BilinearInterpOpGrad : public framework::OperatorWithKernel {
+class InterpolateOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -106,11 +132,11 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
-                  ops::BilinearInterpOpMaker,
+REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
-REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
-                       ops::BilinearInterpKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
-                       ops::BilinearInterpGradKernel<float>);
+REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad);
+REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel<float>,
+                       ops::InterpolateKernel<double>,
+                       ops::InterpolateKernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel<float>,
+                       ops::InterpolateGradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..190afbdac431f863c32e2a4a4b3ad83848e550fc
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -0,0 +1,292 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/operators/interpolate_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>  // Nearest-neighbor forward pass.
+__global__ void KeNearestNeighborInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
+  int nthreads = output_h * output_w;  // total number of output elements
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {  // grid-stride loop
+    int out_id_h = tid / output_w;  // row of the flattened output
+    int out_id_w = tid % output_w;  // offset inside that row
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+    // Output row inside the image -> input row; +0.5 then truncation rounds.
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    // Output column inside the image -> input column, rounded the same way.
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    // Gather: same row and channel, at the selected input position.
+    out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
+                  in_img_idy * in_img_w + in_img_idx];
+  }
+}
+
+template <typename T>  // Nearest-neighbor backward pass.
+__global__ void KeNearestNeighborInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
+  int nthreads = output_h * output_w;  // total number of `out` elements
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {  // grid-stride loop
+    int out_id_h = tid / output_w;  // row of the flattened `out`
+    int out_id_w = tid % output_w;  // offset inside that row
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+    // `out` row inside the image -> `in` row; +0.5 then truncation rounds.
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    // `out` column inside the image -> `in` column, rounded the same way.
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    // Atomic add: presumably several `out` elements can select one `in` slot.
+    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                    in_img_idy * in_img_w + in_img_idx];
+    const T out_pos = out[out_id_h * output_w + out_id_w];
+    platform::CudaAtomicAdd(in_pos, out_pos);
+  }
+}
+
+template <typename T>  // Bilinear forward pass.
+__global__ void KeBilinearInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
+  int nthreads = output_h * output_w;  // total number of output elements
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {  // grid-stride loop
+    int out_id_h = tid / output_w;  // row of the flattened output
+    int out_id_w = tid % output_w;  // offset inside that row
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+    // Rows: base row, step to the next row (0 on the last row), weights.
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = ratio_h * out_img_idy;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+    // Columns: base column, step to the next one (0 on the last), weights.
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = ratio_w * out_img_idx;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+    // in_pos addresses the base (row, column) neighbor in the input.
+    const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                          in_img_idy * in_img_w + in_img_idx];
+
+    // Blend the four neighbors: h2/w2 weigh the base, h1/w1 the next one.
+    out[out_id_h * output_w + out_id_w] =
+        h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
+        h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
+                    w1lambda * in_pos[h_id * in_img_w + w_id]);
+  }
+}
+
+template <typename T>  // Bilinear backward pass.
+__global__ void KeBilinearInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const T ratio_h, const T ratio_w) {
+  int nthreads = output_h * output_w;  // total number of `out` elements
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {  // grid-stride loop
+    int out_id_h = tid / output_w;  // row of the flattened `out`
+    int out_id_w = tid % output_w;  // offset inside that row
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+    // NOTE(review): ratio_h/ratio_w are T here but float in KeBilinearInterpFw.
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = ratio_h * out_img_idy;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+    // Columns mirror the rows: base index, step to the next (0 on the last).
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = ratio_w * out_img_idx;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+    // Scatter out_pos[0] to the four neighbors, scaled by the bilinear weights.
+    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                    in_img_idy * in_img_w + in_img_idx];
+    const T* out_pos = &out[out_id_h * output_w + out_id_w];
+    platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+                            h1lambda * w2lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+                            h1lambda * w1lambda * out_pos[0]);
+  }
+}
+
+// GPU forward implementation of the interpolate op: resizes X, read as
+// (n, c, in_h, in_w), to (n, c, out_h, out_w) by "nearest" or "bilinear".
+template <typename T>
+class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* input_data = input->data<T>();
+    // OutSize (height, width), when given, overrides the out_h/out_w attrs.
+    auto interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;  // host copy; must be complete before it is read below
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int in_h = input->dims()[2];
+    int in_w = input->dims()[3];
+
+    auto* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+
+    int in_chw = c * in_h * in_w;     // elements per input batch item
+    int out_chw = c * out_h * out_w;  // elements per output batch item
+    // Source step per output step; 0 when the output extent is 1.
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    // Same spatial size: the result is a plain copy of X.
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*input, ctx.GetPlace(), output);
+      return;
+    }
+    // 512 threads per block and at most 8 blocks; the kernels loop by stride.
+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpFw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w);
+    } else if ("bilinear" == interp_method) {
+      KeBilinearInterpFw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w);
+    }
+  }
+};
+
+// GPU backward implementation of the interpolate op: zero-fills the gradient
+// of X, then accumulates the gradient of Out into it ("nearest"/"bilinear").
+template <typename T>
+class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output_grad_data = output_grad->data<T>();
+    auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+    // Start from zero: the kernels below only add into the input gradient.
+    auto& device_ctx =
+        ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+    zero(device_ctx, input_grad, static_cast<T>(0.0));
+    // OutSize (height, width), when given, overrides the out_h/out_w attrs.
+    auto interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;  // host copy; must be complete before it is read below
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
+    int n = input_grad->dims()[0];
+    int c = input_grad->dims()[1];
+    int in_h = input_grad->dims()[2];
+    int in_w = input_grad->dims()[3];
+
+    int in_chw = c * in_h * in_w;     // elements per input-grad batch item
+    int out_chw = c * out_h * out_w;  // elements per output-grad batch item
+    // Source step per output step; 0 when the output extent is 1.
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    // Same spatial size: the input gradient is a plain copy.
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
+      return;
+    }
+    // 512 threads per block and at most 8 blocks; the kernels loop by stride.
+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpBw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w);
+    } else if ("bilinear" == interp_method) {
+      KeBilinearInterpBw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel<float>,
+                        ops::InterpolateOpCUDAKernel<double>,
+                        ops::InterpolateOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(interpolate_grad,
+                        ops::InterpolateGradOpCUDAKernel<float>,
+                        ops::InterpolateGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fdb3e1f5a2ff82284d89dd0759e357978e1d873
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -0,0 +1,236 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+
+template <typename T>  // CPU nearest-neighbor forward pass.
+static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
+                                       const float ratio_h, const float ratio_w,
+                                       const int n, const int c,
+                                       const int out_h, const int out_w) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+  for (int k = 0; k < out_h; k++) {  // loop over output rows
+    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    // in_k / in_l: source row / column; +0.5 then truncation rounds.
+    for (int l = 0; l < out_w; l++) {  // loop over output columns
+      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      // Every batch item and channel reads the same (in_k, in_l) position.
+      for (int i = 0; i < n; i++) {    // loop over batch items
+        for (int j = 0; j < c; j++) {  // loop over channels
+          output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>  // CPU bilinear forward pass.
+static void BilinearInterpolation(const Tensor& input, Tensor* output,
+                                  const float ratio_h, const float ratio_w,
+                                  const int in_h, const int in_w, const int n,
+                                  const int c, const int out_h,
+                                  const int out_w) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+  for (int k = 0; k < out_h; k++) {  // loop over output rows
+    int y_n = static_cast<int>(ratio_h * k);
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float d_n = ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    // y_n/y_s: base/next row (clamped); d_n weighs y_s and d_s weighs y_n.
+    for (int l = 0; l < out_w; l++) {  // loop over output columns
+      int x_w = static_cast<int>(ratio_w * l);
+      int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+      float d_w = ratio_w * l - x_w;
+      float d_e = 1.f - d_w;
+      // x_w/x_e: base/next col (clamped); d_w weighs x_e and d_e weighs x_w.
+      for (int i = 0; i < n; i++) {    // loop over batch items
+        for (int j = 0; j < c; j++) {  // loop over channels
+          // weighted sum of the four neighboring input values
+          output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e +
+                                 input_t(i, j, y_s, x_w) * d_n * d_e +
+                                 input_t(i, j, y_n, x_e) * d_s * d_w +
+                                 input_t(i, j, y_s, x_e) * d_n * d_w;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>  // CPU nearest-neighbor backward pass.
+static void NearestNeighborInterpolateGrad(const Tensor& output_grad,
+                                           Tensor* input_grad,
+                                           const float ratio_h,
+                                           const float ratio_w, const int n,
+                                           const int c, const int out_h,
+                                           const int out_w) {
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  for (int k = 0; k < out_h; k++) {  // loop over output-grad rows
+    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    // in_k / in_l: target row / column; +0.5 then truncation rounds.
+    for (int l = 0; l < out_w; l++) {  // loop over output-grad columns
+      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      // Accumulates with +=, so input_grad must hold initialized values.
+      for (int i = 0; i < n; i++) {    // loop over batch items
+        for (int j = 0; j < c; j++) {  // loop over channels
+          input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>  // CPU bilinear backward pass.
+static void BilinearInterpolationGrad(const Tensor& output_grad,
+                                      Tensor* input_grad, const float ratio_h,
+                                      const float ratio_w, const int in_h,
+                                      const int in_w, const int n, const int c,
+                                      const int out_h, const int out_w) {
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  for (int k = 0; k < out_h; k++) {  // loop over output-grad rows
+    int y_n = static_cast<int>(ratio_h * k);
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float d_n = ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    // y_n/y_s: base/next row (clamped); d_n weighs y_s and d_s weighs y_n.
+    for (int l = 0; l < out_w; l++) {  // loop over output-grad columns
+      int x_w = static_cast<int>(ratio_w * l);
+      int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+      float d_w = ratio_w * l - x_w;
+      float d_e = 1.f - d_w;
+      // x_w/x_e: base/next col (clamped); d_w weighs x_e and d_e weighs x_w.
+      for (int i = 0; i < n; i++) {    // loop over batch items
+        for (int j = 0; j < c; j++) {  // loop over channels
+          // spread the gradient over the four neighbors; += accumulates
+          const T grad = output_grad_t(i, j, k, l);
+          input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
+          input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
+          input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
+          input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>  // CPU forward kernel of the interpolate op.
+class InterpolateKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    // Output size: attributes by default, OutSize (height, width) if given.
+    std::string interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = out_size->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    // X's dims are read as (n, c, in_h, in_w).
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+    // Allocate Out as (n, c, out_h, out_w) and zero-fill it.
+    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, output, static_cast<T>(0.0));
+    // Same spatial size: the result is a plain copy of X.
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*input, ctx.GetPlace(), output);
+      return;
+    }
+    // Source step per output step; 0 when the output extent is 1.
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    // Dispatch on the method; any other value leaves Out zero-filled.
+    if ("bilinear" == interp_method) {
+      BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
+                               c, out_h, out_w);
+    } else if ("nearest" == interp_method) {
+      NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
+                                    out_h, out_w);
+    }
+  }
+};
+
+template <typename T>  // CPU backward kernel of the interpolate op.
+class InterpolateGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    // Output size: attributes by default, OutSize (height, width) if given.
+    std::string interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = out_size->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    // X's dims are read as (n, c, in_h, in_w).
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+    // Allocate the input gradient with X's shape and zero-fill it.
+    input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, input_grad, static_cast<T>(0.0));
+    // Same spatial size: the input gradient is a plain copy.
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
+      return;
+    }
+    // Source step per output step; 0 when the output extent is 1.
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    // Dispatch on the method; any other value leaves the gradient at zero.
+    if ("bilinear" == interp_method) {
+      BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
+                                   in_h, in_w, n, c, out_h, out_w);
+    } else if ("nearest" == interp_method) {
+      NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
+                                        ratio_w, n, c, out_h, out_w);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 1d8b1411cddf4fe16d2d00313c519cc173e1504d..e3d09e2d14817fe0f2ccda18ed90c9436b399ae3 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -36,7 +36,7 @@ namespace operators {
 
 void RunServer(std::shared_ptr<distributed::RPCServer> service) {
   service->StartServer();
-  VLOG(4) << "RunServer thread end";
+  VLOG(40) << "RunServer thread end";
 }
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
@@ -66,8 +66,8 @@ static void ParallelExecuteBlocks(
     fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
       int run_block = idx;  // thread local
       try {
-        VLOG(3) << "running server block: " << run_block
-                << "pointer: " << prepared[run_block].get();
+        VLOG(30) << "running server block: " << run_block
+                 << "pointer: " << prepared[run_block].get();
         executor->RunPreparedContext(prepared[run_block].get(), scope);
       } catch (const std::exception &e) {
         LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
@@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop(
     framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
     const std::vector<int> &prefetch_block_id_list,
     const int checkpoint_point_block_id) const {
-  VLOG(2) << "RunSyncLoop";
+  VLOG(20) << "RunSyncLoop";
   size_t num_blocks = program->Size();
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop(
     }
     ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                           recv_scope);
-    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+    VLOG(20) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
     ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
 
@@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
   for (auto &varname : sparse_vars_) {
     auto var = recv_scope->FindVar(varname);
     if (var == nullptr) {
-      VLOG(2) << "can not find var " << varname << " in received scope";
+      VLOG(20) << "can not find var " << varname << " in received scope";
       continue;
     }
     if (var->IsType<framework::SelectedRows>()) {
-      VLOG(3) << "reset sparse var: " << varname;
+      VLOG(30) << "reset sparse var: " << varname;
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     } else {
       PADDLE_THROW("The type of sparse var should be SelectedRows");
@@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
     for (auto &varname : dense_vars_) {
       auto var = recv_scope->FindVar(varname);
       if (var == nullptr) {
-        VLOG(2) << "can not find var " << varname << " in received scope";
+        VLOG(20) << "can not find var " << varname << " in received scope";
         continue;
       }
       if (var->IsType<framework::LoDTensor>()) {
@@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program,
                                    framework::Scope *recv_scope) const {
-  VLOG(2) << "RunAsyncLoop";
+  VLOG(20) << "RunAsyncLoop";
   auto grad_to_block_id_str =
       Attr<std::vector<std::string>>("grad_to_block_id");
   DoubleFindMap<std::string, int32_t> grad_to_block_id;
@@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                               const std::string &grad_and_id) {
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
-    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
+    VLOG(30) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
     PADDLE_ENFORCE_EQ(pieces.size(), 2);
     PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
 
@@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 
   while (true) {
     if (rpc_service_->IsExit()) {
-      VLOG(4) << "get exit!rpc_processor break!";
+      VLOG(40) << "get exit!rpc_processor break!";
       break;
     }
 
@@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   std::string endpoint = Attr<std::string>("endpoint");
   int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
 
-  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
-          << ", end_point:" << endpoint
-          << ", checkpoint_block_id: " << checkpoint_block_id;
+  VLOG(40) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
+           << ", end_point:" << endpoint
+           << ", checkpoint_block_id: " << checkpoint_block_id;
 
   rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
 
@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
        prefetch_var_name_to_block_id_str) {
     std::vector<std::string> pieces;
     split(prefetch_var_name_and_id, ':', &pieces);
-    VLOG(3) << "after split, prefetch_var = " << pieces[0]
-            << ", id=" << pieces[1];
+    VLOG(30) << "after split, prefetch_var = " << pieces[0]
+             << ", id=" << pieces[1];
     PADDLE_ENFORCE_EQ(pieces.size(), 2);
 
     int block_id = std::stoi(pieces[1]);
@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
-  VLOG(3) << "wait server thread to become ready...";
+  VLOG(30) << "wait server thread to become ready...";
   rpc_service_->WaitServerReady();
 
   // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 51219504ffa2a778b56351f759e8a8dfb951ad91..df1edc5c2e994b3093d6f6e7e4f6e0e5b2abb469 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -40,8 +40,9 @@ class LoadOp : public framework::OperatorBase {
 
     auto out_var_name = Output("Out");
     auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                   out_var_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Output variable %s cannot be found in scope %p",
+                   out_var_name, &scope);
 
     if (out_var->IsType<framework::LoDTensor>()) {
       LoadLodTensor(fin, place, out_var);
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 166952fe23192799443ef9c9d1f7ba5056d19290..59ef9cb626d61f918c8ad1990a0f25030fb44ec6 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
-    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
+    VLOG(100) << "Level = " << static_cast<size_t>(Attr<int>("level"));
     out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
-    VLOG(10) << Input("X") << "'s lod information is " << *out;
+    VLOG(100) << Input("X") << "'s lod information is " << *out;
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index de3f0990e109cacd49c4d888bbc1f797fb196e01..a6843f20a59a23bd4e875b0f96524cc8d7aa46d6 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -45,6 +45,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
     auto out_var = scope.FindVar(Output("Out"));
     auto w_var = scope.FindVar(Input("W"));
     auto ids_var = scope.FindVar(Input("Ids"));
+    auto is_test = Attr<bool>("is_test");
 
     PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
                    "The type of Out var should be LodTensor.");
@@ -65,7 +66,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
                       framework::proto::VarType::FP32,
                       "The sparse table only support FP32");
-    w_t->Get(ids_t, out_t, true);
+    w_t->Get(ids_t, out_t, true, is_test);
   }
 };
 
@@ -91,6 +92,10 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool default false)"
                   "Whether create new value if for nonexistent key.")
         .SetDefault(true);
+    AddAttr<bool>("is_test",
+                  "In test mode, lookup_sparse_table will "
+                  "return a 0 for unknown id")
+        .SetDefault(false);
     AddComment(R"DOC(
 Lookup Sprase Tablel Operator.
 
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 3226a727b1f5f6de9e97ce2068381be7c9b69ff3..1878dfe8a897db1b8c948d325fa48a38ca224a2b 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
+      VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
+               << " is set to SelectedRows";
       block->Var(out_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
+      VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
+               << " is set to LoDTensor";
       block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
     }
     block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 52b459a6a2e56b7c256efdb535b4652c64bae23c..61c3cb34a2472c0ba7d2a7ea5abf8e826a793951 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
 #include <string>
+#include "paddle/fluid/operators/math/blas.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -29,34 +30,43 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
                   const framework::Tensor& input, framework::Tensor* out,
                   framework::Tensor* mid, int N, int C, int H, int W, int n,
                   T k, T alpha, T beta) {
-    auto x_v = framework::EigenVector<T>::Flatten(input);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-    e_mid = e_mid.constant(k);
-
-    auto e_x = framework::EigenTensor<T, 4>::From(input);
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        for (int c = start; c < end; c++) {
-          int ch = i + c;
-          if (ch >= 0 && ch < C) {
-            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                               Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            s += alpha * r.square();
-          }
-        }
+    const T* idata = input.data<T>();
+    auto place = ctx.GetPlace();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    T* odata = out->mutable_data<T>(place);
+    T* mdata = mid->mutable_data<T>(place);
+    Tensor squared;
+    T* sdata = squared.mutable_data<T>({1, C + n - 1, H, W}, place);
+    std::memset(sdata, 0, sizeof(T) * squared.numel());
+    for (int i = 0; i < mid->numel(); ++i) {
+      mdata[i] = k;
+    }
+    int img_size = H * W;
+    int fea_size = C * img_size;
+    int pre_pad = (n - 1) / 2;
+    // compute batches one by one
+    for (int i = 0; i < N; ++i) {
+      blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
+      // init the first channel of mid
+      for (int c = 0; c < n; ++c) {
+        blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
+      }
+      for (int c = 1; c < C; ++c) {
+        // copy previous scale
+        int mid_offset = i * fea_size + c * img_size;
+        std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size,
+                    img_size * sizeof(T));
+        // add last
+        blas.AXPY(img_size, alpha, sdata + (c + n - 1) * img_size,
+                  mdata + mid_offset);
+        // sub rest
+        blas.AXPY(img_size, -alpha, sdata + (c - 1) * img_size,
+                  mdata + mid_offset);
       }
     }
-
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e = x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+    // compute the final output
+    blas.VPOW(mid->numel(), mdata, -beta, odata);
+    blas.VMUL(mid->numel(), odata, idata, odata);
   }
 };
 template struct LRNFunctor<platform::CPUDeviceContext, float>;
@@ -156,6 +166,9 @@ class LRNOp : public framework::OperatorWithKernel {
     auto x_dim = ctx->GetInputDim("X");
     PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
 
+    int n = ctx->Attrs().Get<int>("n");
+    PADDLE_ENFORCE(n > 0 && n % 2 == 1, "n should be positive odd value");
+
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
     ctx->SetOutputDim("MidOut", x_dim);
diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
index 0fd3175e8579df9e61368cc151a94fa45e433884..12d39c3815395896343238b536110aecac66a376 100644
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
@@ -60,7 +60,6 @@ class LRNKernel : public framework::OpKernel<T> {
     T beta = ctx.Attr<float>("beta");
     T k = ctx.Attr<float>("k");
 
-    PADDLE_ENFORCE(n > 0, "n should >= 0");
     PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0");
     PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
     PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index da185d93c09f9b06bd5968b9c8e93176f9ef014b..5d0d562030d2a20e4a1cefd3c36c6533fd35dc96 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -152,6 +152,12 @@ class Blas {
   template <typename T>
   void VEXP(int n, const T* x, T* y) const;
 
+  template <typename T>
+  void VSQR(int n, const T* x, T* y) const;
+
+  template <typename T>
+  void VPOW(int n, const T* x, T alpha, T* y) const;
+
   template <typename T>
   void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
             T* C) const;
@@ -238,6 +244,16 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template VEXP<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VSQR(ARGS... args) const {
+    Base()->template VSQR<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void VPOW(ARGS... args) const {
+    Base()->template VPOW<T>(args...);
+  }
+
   template <typename... ARGS>
   void GEMV(ARGS... args) const {
     Base()->template GEMV<T>(args...);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index e1df78d11e41c5f74e244643f40c6d0581fa6a4a..59454669be9e0f92a6fc0db52445307d88e1c7d8 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
+#include <cmath>
 #include <limits>
 #include <vector>
 #include "paddle/fluid/operators/math/math_function.h"
@@ -102,6 +103,16 @@ struct CBlas<float> {
   static void VEXP(ARGS... args) {
     platform::dynload::vsExp(args...);
   }
+
+  template <typename... ARGS>
+  static void VSQR(ARGS... args) {
+    platform::dynload::vsSqr(args...);
+  }
+
+  template <typename... ARGS>
+  static void VPOW(ARGS... args) {
+    platform::dynload::vsPowx(args...);
+  }
 };
 
 template <>
@@ -182,6 +193,16 @@ struct CBlas<double> {
   static void VEXP(ARGS... args) {
     platform::dynload::vdExp(args...);
   }
+
+  template <typename... ARGS>
+  static void VSQR(ARGS... args) {
+    platform::dynload::vdSqr(args...);
+  }
+
+  template <typename... ARGS>
+  static void VPOW(ARGS... args) {
+    platform::dynload::vdPowx(args...);
+  }
 };
 
 #else
@@ -241,6 +262,8 @@ struct CBlas<platform::float16> {
   }
   static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
   static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
+  static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); }
+  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
   static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
   static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
 #ifdef PADDLE_WITH_MKLML
@@ -398,6 +421,31 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
 #endif
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VSQR(int n, const T *x, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VSQR(n, x, y);  // vsSqr/vdSqr: element-wise square
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] * x[i];  // square (not sqrt), same result as the MKL path
+  }
+#endif
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VPOW(int n, const T *x, T a,
+                                            T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VPOW(n, x, a, y);  // float -> vsPowx, double -> vdPowx
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::pow(x[i], a);  // fallback without MKLML: y[i] = x[i]^a
+  }
+#endif
+}
+
 template <>
 template <typename T>
 T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu
index 4e6ff5ee0a449b42762748ba1a103876beee01f2..537c7e47155fe9a12196869ceaed84fca198335b 100644
--- a/paddle/fluid/operators/math/cos_sim_functor.cu
+++ b/paddle/fluid/operators/math/cos_sim_functor.cu
@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
                   T* dy) const {
     const int block_size = 512;
     dim3 threads(block_size, 1);
-    dim3 grid(1, (rows + block_size - 1) / block_size);
+    dim3 grid((rows + block_size - 1) / block_size, 1);
     CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index cd40f1b2f984126663a5711efac24fdf6d680b32..18a586f8dd9f01357d9facca19c51ed5c293ffd2 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
   }
   auto et = GetCurrentUS();
 
-  VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
-          << " us, tgt takes: " << (mt - st) / repeat;
+  VLOG(30) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
+           << " us, tgt takes: " << (mt - st) / repeat;
   for (int i = 0; i < n; ++i) {
     EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
   }
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index 87220d4019fc9337fb8355172ca4f1372cfd4558..b072b4c20a171d148bd892c162436d03da404fb9 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
                                .template Get<jitkernel::VAddReluKernel<T>>(N);
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
-      vaddrelu->Compute(B, dst, dst);
+      vaddrelu->Compute(B, dst, dst, N);
     }
   } else {
     const auto& vadd = jitkernel::KernelPool::Instance()
@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
-      vadd->Compute(B, dst, dst);
+      vadd->Compute(B, dst, dst, N);
     }
   }
 }
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 9e2cc18c7a5e396be40b2336382f68a17f8a2bf9..6b3eecfbd11471b5d95dcb10c91acc536f78cb85 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -24,43 +24,95 @@ namespace gen {
 
 using namespace platform::jit;  // NOLINT
 
-bool VMulJitCode::init(int d) {
+bool VXXJitCode::init(int d, int scalar_index) {
   // It's not necessary to use avx512 since it would slow down the frequency
   // and this kernel is not compute bound.
-  return MayIUse(avx);
+  return MayIUse(avx) && scalar_index >= 0 && scalar_index <= 2;
 }
 
-void VMulJitCode::generate() {
+void VXXJitCode::generate() {
   // do not need push stack, and do not need save avx512reg if do not use avx512
   int offset = 0;
+  if (with_relu_) {
+    vxorps(ymm_zero, ymm_zero, ymm_zero);
+  }
+  if (scalar_index_ == 1) {
+    vbroadcastss(ymm_src1, ptr[param1]);
+  } else if (scalar_index_ == 2) {
+    vbroadcastss(ymm_src2, ptr[param2]);
+  }
   for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
-    vmovups(ymm_src1, ptr[param1 + offset]);
-    vmovups(ymm_src2, ptr[param2 + offset]);
-    vmulps(ymm_dst, ymm_src1, ymm_src2);
+    if (scalar_index_ != 1) {
+      vmovups(ymm_src1, ptr[param1 + offset]);
+    }
+    if (scalar_index_ != 2) {
+      vmovups(ymm_src2, ptr[param2 + offset]);
+    }
+    if (type_ == operand_type::mul) {
+      vmulps(ymm_dst, ymm_src1, ymm_src2);
+    } else if (type_ == operand_type::add) {
+      vaddps(ymm_dst, ymm_src1, ymm_src2);
+    }
+    if (with_relu_) {
+      vmaxps(ymm_dst, ymm_zero, ymm_dst);
+    }
     vmovups(ptr[param3 + offset], ymm_dst);
     offset += sizeof(float) * AVX_FLOAT_BLOCK;
   }
   int rest = num_ % AVX_FLOAT_BLOCK;
   if (rest >= 4) {
-    vmovups(xmm_src1, ptr[param1 + offset]);
-    vmovups(xmm_src2, ptr[param2 + offset]);
-    vmulps(xmm_dst, xmm_src1, xmm_src2);
+    if (scalar_index_ != 1) {
+      vmovups(xmm_src1, ptr[param1 + offset]);
+    }
+    if (scalar_index_ != 2) {
+      vmovups(xmm_src2, ptr[param2 + offset]);
+    }
+    if (type_ == operand_type::mul) {
+      vmulps(xmm_dst, xmm_src1, xmm_src2);
+    } else if (type_ == operand_type::add) {
+      vaddps(xmm_dst, xmm_src1, xmm_src2);
+    }
+    if (with_relu_) {
+      vmaxps(xmm_dst, xmm_zero, xmm_dst);
+    }
     vmovups(ptr[param3 + offset], xmm_dst);
     offset += sizeof(float) * 4;
     rest -= 4;
   }
   if (rest >= 2) {
-    vmovq(xmm_src1, ptr[param1 + offset]);
-    vmovq(xmm_src2, ptr[param2 + offset]);
-    vmulps(xmm_dst, xmm_src1, xmm_src2);
+    if (scalar_index_ != 1) {
+      vmovq(xmm_src1, ptr[param1 + offset]);  // 8 bytes: only 2 floats left
+    }
+    if (scalar_index_ != 2) {
+      vmovq(xmm_src2, ptr[param2 + offset]);  // 16-byte load would over-read
+    }
+    if (type_ == operand_type::mul) {
+      vmulps(xmm_dst, xmm_src1, xmm_src2);
+    } else if (type_ == operand_type::add) {
+      vaddps(xmm_dst, xmm_src1, xmm_src2);
+    }
+    if (with_relu_) {
+      vmaxps(xmm_dst, xmm_zero, xmm_dst);
+    }
     vmovq(ptr[param3 + offset], xmm_dst);
     offset += sizeof(float) * 2;
     rest -= 2;
   }
   if (rest > 0) {
-    vmovss(xmm_src1, ptr[param1 + offset]);
-    vmovss(xmm_src2, ptr[param2 + offset]);
-    vmulss(xmm_dst, xmm_src1, xmm_src2);
+    if (scalar_index_ != 1) {
+      vmovss(xmm_src1, ptr[param1 + offset]);  // 4 bytes: the last float
+    }
+    if (scalar_index_ != 2) {
+      vmovss(xmm_src2, ptr[param2 + offset]);  // 16-byte load would over-read
+    }
+    if (type_ == operand_type::mul) {
+      vmulss(xmm_dst, xmm_src1, xmm_src2);
+    } else if (type_ == operand_type::add) {
+      vaddss(xmm_dst, xmm_src1, xmm_src2);
+    }
+    if (with_relu_) {
+      vmaxps(xmm_dst, xmm_zero, xmm_dst);
+    }
     vmovss(ptr[param3 + offset], xmm_dst);
   }
   ret();
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index 6007b290815de0ceaa2226962c5273ae7da72e7e..aaedb0ae10323eeddfba9512d9e47c7a22320610 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/operators/math/jit_gen.h"
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -29,17 +29,47 @@ using ymm_t = const Xbyak::Ymm;
 using zmm_t = const Xbyak::Zmm;
 using Label = Xbyak::Label;
 
-class VMulJitCode : public JitCode {
+typedef enum { mul = 0, add } operand_type;
+
+// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
+class VXXJitCode : public JitCode {
  public:
-  DECLARE_JIT_CODE(VMulJitCode);
-  explicit VMulJitCode(int d, size_t code_size = 256 * 1024,
-                       void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), num_(d) {}
-  static bool init(int d);
+  // Builds a descriptive kernel name, e.g. VXXJitCode_Vec_Mul_Vec or
+  // VXXJitCode_Scalar_Add_Vec_Relu, from the operand kinds, the operation
+  // and the optional relu.
+  //
+  // The text lives in thread-local storage: returning c_str() of a local
+  // std::string would hand the caller a dangling pointer. The result stays
+  // valid until the next name() call on the same thread.
+  const char* name() const override {
+    thread_local std::string base;
+    base = "VXXJitCode";
+    base += (scalar_index_ == 1 ? "_Scalar" : "_Vec");
+    if (type_ == operand_type::mul) {
+      base += "_Mul";
+    } else if (type_ == operand_type::add) {
+      base += "_Add";
+    }
+    base += (scalar_index_ == 2 ? "_Scalar" : "_Vec");
+    base += (with_relu_ ? "_Relu" : "");
+    return base.c_str();
+  }
+  explicit VXXJitCode(int d, operand_type type, int scalar_index,
+                      bool with_relu, size_t code_size = 256 * 1024,
+                      void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        num_(d),
+        type_(type),
+        scalar_index_(scalar_index),
+        with_relu_(with_relu) {}
+  static bool init(int d, int scalar_index = 0);
   void generate() override;
 
  private:
   int num_;
+  operand_type type_;
+  int scalar_index_;
+  bool with_relu_;
   reg64_t param1{abi_param1};
   reg64_t param2{abi_param2};
   reg64_t param3{abi_param3};
@@ -47,10 +77,12 @@ class VMulJitCode : public JitCode {
   xmm_t xmm_src1 = xmm_t(0);
   xmm_t xmm_src2 = xmm_t(1);
   xmm_t xmm_dst = xmm_t(2);
+  xmm_t xmm_zero = xmm_t(3);
 
   ymm_t ymm_src1 = ymm_t(0);
   ymm_t ymm_src2 = ymm_t(1);
   ymm_t ymm_dst = ymm_t(2);
+  ymm_t ymm_zero = ymm_t(3);
 };
 
 }  // namespace gen
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 7b6027aa267803ff8ff830deabda536b1b27fec8..e9b259282cd00cc2afc46634423ec09590bf5dd3 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -71,26 +71,27 @@ class VMulKernel : public Kernel {
 template <typename T>
 class VAddKernel : public Kernel {
  public:
-  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+  void (*Compute)(const T *, const T *, T *, int);
 };
 
 template <typename T>
-class VScalKernel : public Kernel {
+class VAddReluKernel : public Kernel {
  public:
-  virtual void Compute(const T a, const T *x, T *y) const = 0;
-  virtual void Compute(const T a, T *x) const = 0;
+  void (*Compute)(const T *, const T *, T *, int);
 };
 
 template <typename T>
-class VAddBiasKernel : public Kernel {
+class VScalKernel : public Kernel {
  public:
-  virtual void Compute(const T a, const T *x, T *y) const = 0;
+  // y = a.*x
+  void (*Compute)(const T *, const T *, T *, int);
 };
 
 template <typename T>
-class VAddReluKernel : public Kernel {
+class VAddBiasKernel : public Kernel {
  public:
-  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+  // y = a.+x
+  void (*Compute)(const T *, const T *, T *, int);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index 8a988f8f482e4a4963f70c39bccd89387c1e0059..c4bfbcf925a2bbdc39f8468049c58e126d3eba1b 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -42,6 +42,35 @@ void VMulRefer(const T* x, const T* y, T* z, int n) {
   }
 }
 
+// Reference (non-JIT, non-MKL) element-wise addition.
+// Writes z[k] = x[k] + y[k] for every k in [0, n).
+template <typename T>
+void VAddRefer(const T* x, const T* y, T* z, int n) {
+  for (int k = 0; k < n; ++k) z[k] = x[k] + y[k];
+}
+
+// Reference add + ReLU: z[k] = x[k] + y[k] when that sum is > 0, else 0.
+template <typename T>
+void VAddReluRefer(const T* x, const T* y, T* z, int n) {
+  for (int k = 0; k < n; ++k) {
+    z[k] = (x[k] + y[k] > 0) ? x[k] + y[k] : 0;
+  }
+}
+
+// Reference scaling kernel: y[k] = (*a) * x[k] for every k in [0, n).
+// a points at the single scalar factor.
+template <typename T>
+void VScalRefer(const T* a, const T* x, T* y, int n) {
+  for (int k = 0; k < n; ++k) y[k] = *a * x[k];
+}
+
+// Reference bias kernel: y[k] = (*a) + x[k] for every k in [0, n).
+// a points at the single scalar bias.
+template <typename T>
+void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
+  for (int k = 0; k < n; ++k) y[k] = *a + x[k];
+}
+
 #ifdef PADDLE_WITH_MKLML
 template <typename T>
 void VMulMKL(const T* x, const T* y, T* z, int n);
@@ -50,28 +79,67 @@ template <>
 void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
   platform::dynload::vsMul(n, x, y, z);
 }
+
 template <>
 void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
   platform::dynload::vdMul(n, x, y, z);
 }
+
+template <typename T>
+void VAddMKL(const T* x, const T* y, T* z, int n);  // specialized below
+
+template <>
+void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
+  platform::dynload::vsAdd(n, x, y, z);  // single-precision vector add
+}
+
+template <>
+void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
+  platform::dynload::vdAdd(n, x, y, z);  // double-precision vector add
+}
+
+template <typename T>
+void VScalMKL(const T* a, const T* x, T* y, int n);  // specialized below
+
+template <>
+void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
+  if (x == y) {  // in-place: cblas_sscal scales y directly by *a
+    platform::dynload::cblas_sscal(n, *a, y, 1);
+  } else {  // out-of-place: fall back to the reference loop
+    VScalRefer<float>(a, x, y, n);
+  }
+}
+
+template <>
+void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
+  if (x == y) {  // in-place: cblas_dscal scales y directly by *a
+    platform::dynload::cblas_dscal(n, *a, y, 1);
+  } else {  // out-of-place: fall back to the reference loop
+    VScalRefer<double>(a, x, y, n);
+  }
+}
+
 #endif
 
+#define DECLARE_STATIC_FUNC                                 \
+  static inline std::string name(int d) {                   \
+    PADDLE_THROW("DType should be either float or double"); \
+  }                                                         \
+  static inline bool useJIT(int d) { return false; }        \
+  static inline bool useMKL(int d) { return false; }
+
 /* VMUL JitKernel */
 template <typename T>
 class VMulKernelImpl : public VMulKernel<T> {
  public:
-  static inline std::string name(int d) {
-    PADDLE_THROW("DType should be either float or double");
-  }
-  static inline bool useJIT(int d) { return false; }
-  static inline bool useMKL(int d) { return false; }
-
+  DECLARE_STATIC_FUNC;
   explicit VMulKernelImpl(int d) : VMulKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
       // roughly estimate the size of code
       size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VMulJitCode(d, sz > 4096 ? sz : 4096));
+      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
+                                         sz > 4096 ? sz : 4096));
       this->Compute =
           jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
       return;
@@ -89,14 +157,14 @@ class VMulKernelImpl : public VMulKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
 
  private:
-  std::unique_ptr<gen::VMulJitCode> jitcode_{nullptr};
+  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
 #endif
 };
 
 #ifdef PADDLE_WITH_XBYAK
 template <>
 bool VMulKernelImpl<float>::useJIT(int d) {
-  return gen::VMulJitCode::init(d);
+  return gen::VXXJitCode::init(d);
 }
 #endif
 
@@ -112,189 +180,177 @@ bool VMulKernelImpl<double>::useMKL(int d) {
 }
 #endif
 
-REGISTER_JITKERNEL(vmul, VMulKernel);
-
-/* VADD JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+/* VAdd JitKernel */
+template <typename T>
 class VAddKernelImpl : public VAddKernel<T> {
  public:
-  explicit VAddKernelImpl(int d) : VAddKernel<T>() { this->num_ = d; }
-  void Compute(const T* x, const T* y, T* z) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      z[i] = x[i] + y[i];
+  DECLARE_STATIC_FUNC;
+  explicit VAddKernelImpl(int d) : VAddKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(d)) {
+      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
+                                         sz > 4096 ? sz : 4096));
+      this->Compute =
+          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
+      return;
     }
-  }
-};
-
+#endif
 #ifdef PADDLE_WITH_MKLML
-#define MKL_FLOAT(isa, block)                           \
-  template <>                                           \
-  void VAddKernelImpl<float, isa, block>::Compute(      \
-      const float* x, const float* y, float* z) const { \
-    platform::dynload::vsAdd(this->num_, x, y, z);      \
+    if (useMKL(d)) {
+      this->Compute = VAddMKL<T>;
+      return;
+    }
+#endif
+    this->Compute = VAddRefer<T>;
   }
+#ifdef PADDLE_WITH_XBYAK
 
-#define MKL_DOUBLE(isa, block)                             \
-  template <>                                              \
-  void VAddKernelImpl<double, isa, block>::Compute(        \
-      const double* x, const double* y, double* z) const { \
-    platform::dynload::vdAdd(this->num_, x, y, z);         \
-  }
+ private:
+  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
+#endif
+};
 
-FOR_EACH_ISA(MKL_FLOAT, kGT16);
-FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool VAddKernelImpl<float>::useJIT(int d) {
+  return gen::VXXJitCode::init(d);
+}
 #endif
 
-#define INTRI8_FLOAT(isa)                               \
-  template <>                                           \
-  void VAddKernelImpl<float, isa, kEQ8>::Compute(       \
-      const float* x, const float* y, float* z) const { \
-    __m256 tmpx, tmpy;                                  \
-    tmpx = _mm256_loadu_ps(x);                          \
-    tmpy = _mm256_loadu_ps(y);                          \
-    tmpx = _mm256_add_ps(tmpx, tmpy);                   \
-    _mm256_storeu_ps(z, tmpx);                          \
-  }
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
+#ifdef PADDLE_WITH_MKLML
+template <>
+bool VAddKernelImpl<float>::useMKL(int d) {
+  return d > 512;
+}
+
+template <>
+bool VAddKernelImpl<double>::useMKL(int d) {
+  return true;
+}
 #endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
+
+/* VAddRelu JitKernel */
+template <typename T>
+class VAddReluKernelImpl : public VAddReluKernel<T> {
+ public:
+  DECLARE_STATIC_FUNC;
+  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(d)) {
+      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
+                                         sz > 4096 ? sz : 4096));
+      this->Compute =
+          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
+      return;
+    }
 #endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
+    this->Compute = VAddReluRefer<T>;
+  }
+#ifdef PADDLE_WITH_XBYAK
+
+ private:
+  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
 #endif
-// TODO(TJ): eq16 test and complete avx512
+};
 
-#undef INTRI8_FLOAT
-#undef MKL_FLOAT
-#undef MKL_DOUBLE
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool VAddReluKernelImpl<float>::useJIT(int d) {
+  return gen::VXXJitCode::init(d);
+}
+#endif
 
-/* VSCAL JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+/* VScal JitKernel */
+template <typename T>
 class VScalKernelImpl : public VScalKernel<T> {
  public:
-  explicit VScalKernelImpl(int d) : VScalKernel<T>() { this->num_ = d; }
-  void Compute(const T a, const T* x, T* y) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      y[i] = a * x[i];
-    }
-  }
-  void Compute(const T a, T* x) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      x[i] = a * x[i];
+  DECLARE_STATIC_FUNC;
+  explicit VScalKernelImpl(int d) : VScalKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(d)) {
+      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
+                                         sz > 4096 ? sz : 4096));
+      this->Compute =
+          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
+      return;
     }
-  }
-};
-
+#endif
 #ifdef PADDLE_WITH_MKLML
-#define MKL_FLOAT(isa, block)                                               \
-  template <>                                                               \
-  void VScalKernelImpl<float, isa, block>::Compute(const float a, float* x) \
-      const {                                                               \
-    platform::dynload::cblas_sscal(this->num_, a, x, 1);                    \
-  }
-
-#define MKL_DOUBLE(isa, block)                                                 \
-  template <>                                                                  \
-  void VScalKernelImpl<double, isa, block>::Compute(const double a, double* x) \
-      const {                                                                  \
-    platform::dynload::cblas_dscal(this->num_, a, x, 1);                       \
-  }
-
-FOR_EACH_ISA(MKL_FLOAT, kGT16);
-FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+    if (useMKL(d)) {
+      this->Compute = VScalMKL<T>;
+      return;
+    }
 #endif
-
-#define INTRI8_FLOAT(isa)                              \
-  template <>                                          \
-  void VScalKernelImpl<float, isa, kEQ8>::Compute(     \
-      const float a, const float* x, float* y) const { \
-    __m256 tmp;                                        \
-    __m256 scalar = _mm256_set1_ps(a);                 \
-    tmp = _mm256_loadu_ps(x);                          \
-    tmp = _mm256_mul_ps(tmp, scalar);                  \
-    _mm256_storeu_ps(y, tmp);                          \
-  }
-#define INTRI8_INPLACE_FLOAT(isa)                                          \
-  template <>                                                              \
-  void VScalKernelImpl<float, isa, kEQ8>::Compute(const float a, float* x) \
-      const {                                                              \
-    __m256 tmp;                                                            \
-    __m256 scalar = _mm256_set1_ps(a);                                     \
-    tmp = _mm256_loadu_ps(x);                                              \
-    tmp = _mm256_mul_ps(tmp, scalar);                                      \
-    _mm256_storeu_ps(x, tmp);                                              \
+    this->Compute = VScalRefer<T>;
   }
+#ifdef PADDLE_WITH_XBYAK
 
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-INTRI8_INPLACE_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-INTRI8_INPLACE_FLOAT(jit::avx2);
+ private:
+  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
 #endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
-INTRI8_INPLACE_FLOAT(jit::avx512f);
+};
+
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool VScalKernelImpl<float>::useJIT(int d) {
+  return gen::VXXJitCode::init(d, 1);
+}
 #endif
-// TODO(TJ): eq16 test and complete avx512
 
-#undef INTRI8_FLOAT
-#undef INTRI8_INPLACE_FLOAT
-#undef MKL_FLOAT
-#undef MKL_DOUBLE
+#ifdef PADDLE_WITH_MKLML
+template <>
+bool VScalKernelImpl<float>::useMKL(int d) {
+  return d > 512;
+}
+template <>
+bool VScalKernelImpl<double>::useMKL(int d) {
+  return true;
+}
+#endif
 
 /* VAddBias JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+template <typename T>
 class VAddBiasKernelImpl : public VAddBiasKernel<T> {
  public:
-  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() { this->num_ = d; }
-  void Compute(const T a, const T* x, T* y) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      y[i] = x[i] + a;
+  DECLARE_STATIC_FUNC;
+  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(d)) {
+      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
+                                         sz > 4096 ? sz : 4096));
+      this->Compute =
+          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
+      return;
     }
-  }
-};
-
-#define INTRI8_FLOAT(isa)                              \
-  template <>                                          \
-  void VAddBiasKernelImpl<float, isa, kEQ8>::Compute(  \
-      const float a, const float* x, float* y) const { \
-    __m256 tmp = _mm256_loadu_ps(x);                   \
-    tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a));       \
-    _mm256_storeu_ps(y, tmp);                          \
-  }
+#endif
 
-#define INTRI16_FLOAT(isa)                             \
-  template <>                                          \
-  void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
-      const float a, const float* x, float* y) const { \
-    __m256 tmp0 = _mm256_loadu_ps(x);                  \
-    __m256 tmp1 = _mm256_loadu_ps(x + 8);              \
-    tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));     \
-    tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));     \
-    _mm256_storeu_ps(y, tmp0);                         \
-    _mm256_storeu_ps(y + 8, tmp1);                     \
+    this->Compute = VAddBiasRefer<T>;
   }
+#ifdef PADDLE_WITH_XBYAK
 
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-INTRI16_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-INTRI16_FLOAT(jit::avx2);
+ private:
+  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
 #endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
-INTRI16_FLOAT(jit::avx512f);
+};
+
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool VAddBiasKernelImpl<float>::useJIT(int d) {
+  return gen::VXXJitCode::init(d, 1);
+}
 #endif
-// TODO(TJ): eq16 test and complete avx512
 
-#undef INTRI8_FLOAT
-#undef INTRI16_FLOAT
+#undef DECLARE_STATIC_FUNC
+
+REGISTER_JITKERNEL(vmul, VMulKernel);
+REGISTER_JITKERNEL(vadd, VAddKernel);
+REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
+REGISTER_JITKERNEL(vscal, VScalKernel);
+REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
 
 /* VRelu JitKernel */
 template <typename T, platform::jit::cpu_isa_t isa, jit_block>
@@ -405,98 +461,7 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
   void Compute(const T* x, T* y) const override {}
 };
 
-/* VAddRelu JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
-class VAddReluKernelImpl : public VAddReluKernel<T> {
- public:
-  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }
-  void Compute(const T* x, const T* y, T* z) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      z[i] = x[i] + y[i];
-      z[i] = z[i] > 0 ? z[i] : 0;
-    }
-  }
-};
-
-#define INTRI8_FLOAT(isa)                               \
-  template <>                                           \
-  void VAddReluKernelImpl<float, isa, kEQ8>::Compute(   \
-      const float* x, const float* y, float* z) const { \
-    __m256 tmpx = _mm256_loadu_ps(x);                   \
-    __m256 tmpy = _mm256_loadu_ps(y);                   \
-    tmpy = _mm256_add_ps(tmpx, tmpy);                   \
-    tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps());    \
-    _mm256_storeu_ps(z, tmpy);                          \
-  }
-
-#define INTRI16_FLOAT(isa)                              \
-  template <>                                           \
-  void VAddReluKernelImpl<float, isa, kEQ16>::Compute(  \
-      const float* x, const float* y, float* z) const { \
-    __m256 zeros = _mm256_setzero_ps();                 \
-    __m256 tmp0 = _mm256_loadu_ps(x);                   \
-    __m256 tmp1 = _mm256_loadu_ps(y);                   \
-    tmp0 = _mm256_add_ps(tmp0, tmp1);                   \
-    tmp0 = _mm256_max_ps(tmp0, zeros);                  \
-    tmp1 = _mm256_loadu_ps(x + 8);                      \
-    __m256 tmp2 = _mm256_loadu_ps(y + 8);               \
-    tmp1 = _mm256_add_ps(tmp1, tmp2);                   \
-    tmp1 = _mm256_max_ps(tmp1, zeros);                  \
-    _mm256_storeu_ps(z, tmp0);                          \
-    _mm256_storeu_ps(z + 8, tmp1);                      \
-  }
-
-#define INTRI_COMMON_FLOAT(isa, block)                             \
-  template <>                                                      \
-  VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
-      : VAddReluKernel<float>() {                                  \
-    this->num_ = d;                                                \
-    this->end_ = d - d % AVX_FLOAT_BLOCK;                          \
-    this->rest_ = d - this->end_;                                  \
-  }                                                                \
-  template <>                                                      \
-  void VAddReluKernelImpl<float, isa, block>::Compute(             \
-      const float* x, const float* y, float* z) const {            \
-    __m256 zeros = _mm256_setzero_ps();                            \
-    for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {        \
-      __m256 tmpx = _mm256_loadu_ps(x + i);                        \
-      __m256 tmpy = _mm256_loadu_ps(y + i);                        \
-      tmpy = _mm256_add_ps(tmpx, tmpy);                            \
-      tmpy = _mm256_max_ps(tmpy, zeros);                           \
-      _mm256_storeu_ps(z + i, tmpy);                               \
-    }                                                              \
-    for (int i = this->end_; i < this->num_; ++i) {                \
-      z[i] = x[i] + y[i];                                          \
-      z[i] = z[i] > 0 ? z[i] : 0;                                  \
-    }                                                              \
-  }
-
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-INTRI16_FLOAT(jit::avx);
-INTRI_COMMON_FLOAT(jit::avx, kGT16);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-INTRI16_FLOAT(jit::avx2);
-INTRI_COMMON_FLOAT(jit::avx2, kGT16);
-#endif
-#ifdef __AVX512F__
-// TODO(TJ): refine avx512
-INTRI8_FLOAT(jit::avx512f);
-INTRI16_FLOAT(jit::avx512f);
-INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
-#endif
-
-#undef INTRI8_FLOAT
-#undef INTRI16_FLOAT
-#undef INTRI_COMMON_FLOAT
-
-REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel);
-REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel);
-REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel);
 REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
-REGISTER_JITKERNEL_DEPRECATED(vaddrelu, VAddReluKernel);
 REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
 
 }  // namespace jitkernel
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index d7c177e6782e19e199542e10e1d62587ee0df4cf..c55e54a13f539014c0f582436ca1a105d0b0fedd 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -409,10 +409,11 @@ class VTanhKernelImpl : public VTanhKernel<T> {
     vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d);
   }
   void Compute(const T* x, T* y) const override {
-    vscal_->Compute(static_cast<T>(2), x, y);
+    const T a = static_cast<T>(2), b = static_cast<T>(-1);
+    vscal_->Compute(&a, x, y, this->num_);
     vsigmoid_->Compute(y, y);
-    vscal_->Compute(static_cast<T>(2), y);
-    vaddbias_->Compute(static_cast<T>(-1), y, y);
+    vscal_->Compute(&a, y, y, this->num_);
+    vaddbias_->Compute(&b, y, y, this->num_);
   }
 
  private:
@@ -472,10 +473,11 @@ class VTanhKernelImpl : public VTanhKernel<T> {
     _mm256_storeu_ps(y, tmp);                                                 \
     x += AVX_FLOAT_BLOCK;                                                     \
     y += AVX_FLOAT_BLOCK;                                                     \
-    vscal_->Compute(2.f, x, y);                                               \
+    const float a = 2.f, b = -1.f; /* x, y advanced: pass tail length */      \
+    vscal_->Compute(&a, x, y, this->num_ - AVX_FLOAT_BLOCK);                  \
     vsigmoid_->Compute(y, y);                                                 \
-    vscal_->Compute(2.f, y);                                                  \
-    vaddbias_->Compute(-1.f, y, y);                                           \
+    vscal_->Compute(&a, y, y, this->num_ - AVX_FLOAT_BLOCK);                  \
+    vaddbias_->Compute(&b, y, y, this->num_ - AVX_FLOAT_BLOCK);               \
   }
 
 #define INTRI_GT16_FLOAT(isa, expisa)                                         \
@@ -502,10 +504,11 @@ class VTanhKernelImpl : public VTanhKernel<T> {
     }                                                                         \
     x += this->end_;                                                          \
     y += this->end_;                                                          \
-    vscal_->Compute(2.f, x, y);                                               \
+    const float a = 2.f, b = -1.f; /* x, y advanced: pass tail length */      \
+    vscal_->Compute(&a, x, y, this->num_ - this->end_);                       \
     vsigmoid_->Compute(y, y);                                                 \
-    vscal_->Compute(2.f, y);                                                  \
-    vaddbias_->Compute(-1.f, y, y);                                           \
+    vscal_->Compute(&a, y, y, this->num_ - this->end_);                       \
+    vaddbias_->Compute(&b, y, y, this->num_ - this->end_);                    \
   }
 
 #ifdef __AVX__
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index d0932a37bb85bbc41f662a106c8ef5693a72efeb..ba3e917377cf12192a068a9d71238442e12d5e5e 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -181,7 +181,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
     act_cand_d_->Compute(gates, gates);
     vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
     vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct);
+    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
 
     /* H_t = act_cell(C_t) * ogated */
     act_cell_d_->Compute(ct, gates + d2_);
@@ -291,16 +291,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
     /* get fgated and igated*/
     vmul_d_->Compute(wp_data, ct_1, checked, d_);
     vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
-    vadd_d2_->Compute(checked, gates + d_, gates + d_);
+    vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
     act_gate_d2_->Compute(gates + d_, gates + d_);
     /* C_t = C_t-1 * fgated + cand_gated * igated*/
     act_cand_d_->Compute(gates, gates);
     vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
     vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct);
+    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
     /* get ogated*/
     vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
     act_gate_d_->Compute(gates + d3_, gates + d3_);
     /* H_t = act_cell(C_t) * ogated */
     act_cell_d_->Compute(ct, gates + d2_);
@@ -314,7 +314,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
     vmul_d_->Compute(gates, gates + d_, ct, d_);
     /* get outgated, put W_oc * C_t on igated */
     vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
     /* H_t = act_cell(C_t) * ogated */
     act_gate_d_->Compute(gates + d3_, gates + d3_);
     act_cell_d_->Compute(ct, gates + d2_);
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 34fa2b9a7814dbd96de1e7c4a5be5a88978445bd..7dc3e600b564d95b46070ff4436b2d0de2f3e105 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -87,7 +87,7 @@ TEST(JitKernel, vrelu) {
         vrelu_intri8(d, x_data, zref_data);
       }
       auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
     }
 #endif
     auto ttgts = GetCurrentUS();
@@ -95,8 +95,9 @@ TEST(JitKernel, vrelu) {
       ker->Compute(x_data, ztgt_data);
     }
     auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -128,12 +129,13 @@ TEST(JitKernel, vaddbias) {
     auto trefe = GetCurrentUS();
     auto ttgts = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->Compute(a, x_data, ztgt_data);
+      ker->Compute(&a, x_data, ztgt_data, d);
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -183,13 +185,14 @@ TEST(JitKernel, vexp) {
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
 #ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+             << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
 #else
-            << " us, "
+             << " us, "
 #endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -254,9 +257,10 @@ TEST(JitKernel, vsigmoid) {
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -281,10 +285,11 @@ void vtanh_better(
         const paddle::operators::math::jitkernel::VAddBiasKernel<float>>&
         vaddbias,
     const int n, const float* x, float* y) {
-  vscal->Compute(2.f, x, y);
+  const float a = 2.f, b = -1.f;
+  vscal->Compute(&a, x, y, n);
   vsigmoid->Compute(y, y);
-  vscal->Compute(2.f, y);
-  vaddbias->Compute(-1.f, y, y);
+  vscal->Compute(&a, y, y, n);
+  vaddbias->Compute(&b, y, y, n);
 }
 
 TEST(JitKernel, vtanh) {
@@ -320,9 +325,10 @@ TEST(JitKernel, vtanh) {
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -371,7 +377,7 @@ void lstm_ctht_better(
   vtanh_d->Compute(gates, gates);
   vmul_d->Compute(gates, gates + d, gates + d, d);
   vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
-  vadd_d->Compute(gates + d, gates + d2, ct);
+  vadd_d->Compute(gates + d, gates + d2, ct, d);
   /* H_t = act_cell(C_t) * ogated */
   vtanh_d->Compute(ct, gates + d2);
   vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
@@ -440,9 +446,10 @@ TEST(JitKernel, lstm) {
       ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
     }
     auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
   }
 }
 
@@ -524,30 +531,32 @@ TEST(JitKernel, vscal) {
         vscal_inp_intri8(d, a, y_data);
       }
       auto si3 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
-              << " us, inplace: " << (si3 - si2) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
+               << " us, inplace: " << (si3 - si2) / repeat;
     }
 #endif
 
     auto ttgts = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->Compute(a, x_data, ztgt_data);
+      ker->Compute(&a, x_data, ztgt_data, d);
     }
     auto ttgte = GetCurrentUS();
     auto ttgts1 = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->Compute(a, y_data);
+      ker->Compute(&a, y_data, y_data, d);
     }
     auto ttgte1 = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, inplace takes: " << (trefe1 - trefs1) / repeat
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, inplace takes: " << (trefe1 - trefs1) / repeat
 #ifdef PADDLE_WITH_MKLML
-            << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
+             << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat
+             << " us, "
 #else
-            << " us, "
+             << " us, "
 #endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat
-            << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat
+             << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -610,7 +619,7 @@ TEST(JitKernel, vmul) {
         vmul_intri8(d, x_data, y_data, zref_data);
       }
       auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
     }
 #endif
 
@@ -620,13 +629,14 @@ TEST(JitKernel, vmul) {
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
 #ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+             << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
 #else
-            << " us, "
+             << " us, "
 #endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -689,23 +699,24 @@ TEST(JitKernel, vadd) {
         vadd_intri8(d, x_data, y_data, zref_data);
       }
       auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
     }
 #endif
 
     auto ttgts = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data);
+      ker->Compute(x_data, y_data, ztgt_data, d);
     }
     auto ttgte = GetCurrentUS();
 
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
 #ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+             << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
 #else
-            << " us, "
+             << " us, "
 #endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
@@ -723,8 +734,8 @@ void vaddrelu_better(
         const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
     const std::shared_ptr<
         const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
-    const float* x, const float* y, float* z) {
-  vadd->Compute(x, y, z);
+    const float* x, const float* y, float* z, int d) {
+  vadd->Compute(x, y, z, d);
   vrelu->Compute(z, z);
 }
 
@@ -752,17 +763,18 @@ TEST(JitKernel, vaddrelu) {
     auto trefe = GetCurrentUS();
     auto tmkls = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data);
+      vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data, d);
     }
     auto tmkle = GetCurrentUS();
     auto ttgts = GetCurrentUS();
     for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data);
+      ker->Compute(x_data, y_data, ztgt_data, d);
     }
     auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
-            << "tgt takes: " << (ttgte - ttgts) / repeat;
+    VLOG(30) << "Vec size " << d
+             << ": refer takes: " << (trefe - trefs) / repeat
+             << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
+             << "tgt takes: " << (ttgte - ttgts) / repeat;
     for (int i = 0; i < d; ++i) {
       EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
     }
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 75946740375d74043960b68e94eb048b3bab4b79..9577a4cb9d275df9604b7578f8685e4d2938a5e9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -270,7 +270,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
                   const std::vector<const framework::SelectedRows*>& inputs,
                   framework::SelectedRows* output) {
     if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
+      VLOG(30) << "no input! return";
       return;
     }
     const framework::SelectedRows* has_value_input = nullptr;
@@ -281,7 +281,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
       }
     }
     if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
+      VLOG(30) << "no input has value! just return" << std::endl;
       return;
     }
     auto input_width = has_value_input->value().dims()[1];
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index a4fa6f5c898c541120a874f962b0f6a817736510..74b9659cfd38076bf1948b5c664817a6753b7090 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -81,7 +81,7 @@ template <typename T, int block_size>
 __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
                                             const int64_t* rows, T* tensor_out,
                                             int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -123,7 +123,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
 
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
         in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
@@ -188,7 +188,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
                                               const int64_t* rows,
                                               T* tensor_out,
                                               int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -221,7 +221,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     auto* in2_data = input2->data<T>();
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
         in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
@@ -314,7 +314,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
                   const std::vector<const framework::SelectedRows*>& inputs,
                   framework::SelectedRows* output) {
     if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
+      VLOG(30) << "no input! return";
       return;
     }
     const framework::SelectedRows* has_value_input = nullptr;
@@ -325,7 +325,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
       }
     }
     if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
+      VLOG(30) << "no input has value! just return" << std::endl;
       return;
     }
     auto input_width = has_value_input->value().dims()[1];
@@ -388,7 +388,7 @@ template <typename T, int block_size>
 __global__ void UpdateToTensorKernel(const T* selected_rows,
                                      const int64_t* rows, const ScatterOps& op,
                                      T* tensor_out, int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -457,7 +457,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
     auto* in2_data = input2->data<T>();
 
     dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
         grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
                                               op, in2_data, in1_row_numel);
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 521c53dd0d71707c13c4364c5ee59943a03d4a2d..6d146d39d6d07678e859b82b25ba60ed7661546d 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -64,6 +64,8 @@ struct SelectedRowsSumTo {
                   framework::SelectedRows* input2);
 };
 
+// FIXME: The result of SelectedRowsAddToTensor may be non-deterministic,
+// because it uses CudaAtomicAdd.
 // input2 = input1 + input2
 template <typename DeviceContext, typename T>
 struct SelectedRowsAddToTensor {
@@ -88,57 +90,6 @@ struct MergeAdd {
                   framework::SelectedRows* output);
 };
 
-template <typename DeviceContext, typename T>
-struct Add {
-  framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input1,
-                                     const framework::SelectedRows& input2) {
-    framework::SelectedRows out;
-    out.set_rows(input1.rows());
-    out.set_height(input1.height());
-    out.mutable_value()->mutable_data<T>(input1.value().dims(),
-                                         context.GetPlace());
-    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
-    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
-    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
-    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
-    return out;
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct Mul {
-  // multiply two SelectedRows
-  framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input1,
-                                     const framework::SelectedRows& input2) {
-    framework::SelectedRows out;
-    out.set_rows(input1.rows());
-    out.set_height(input1.height());
-    out.mutable_value()->mutable_data<T>(input1.value().dims(),
-                                         context.GetPlace());
-    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
-    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
-    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
-    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
-    return out;
-  }
-  // multiply scalar to SelectedRows
-  framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input1,
-                                     const T input2) {
-    framework::SelectedRows out;
-    out.set_rows(input1.rows());
-    out.set_height(input1.height());
-    out.mutable_value()->mutable_data<T>(input1.value().dims(),
-                                         context.GetPlace());
-    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
-    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
-    e_out.device(*context.eigen_device()) = input2 * e_in1;
-    return out;
-  }
-};
-
 enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
 
 // out = seleted_rows_in / tensor
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 19426b3c204095bd415cebcd87cff18468acd564..820636defad0be9fb2e6decefc938658ae70ea9b 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mean_op.h"
-
+#include <string>
 namespace paddle {
 namespace operators {
 
@@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X.
   }
 };
 
+class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:  // overrides a virtual declared by the base class
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // pairs input "X" with output "Out"
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 class MeanGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
     ctx->ShareLoD("X", framework::GradVarName("X"));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =  // data type read from input tensor "X"
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }  // result combines X's data type with the place reported by ctx
 };
 
 class MeanGradMaker : public framework::SingleGradOpDescMaker {
@@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
+                  ops::MeanGradMaker);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 71f079e4d97f5259359ee6572f584894551452ca..e5b756b4fa637f2d4136f8c8a87bf34c6c04413a 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
 
       // sparse update maybe empty.
       if (grad->rows().size() == 0) {
-        VLOG(3) << "Grad SelectedRows contains no data!";
+        VLOG(30) << "Grad SelectedRows contains no data!";
         return;
       }
       auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 363abfb0e0c96e8a4d82124dff168f28e339a9ae..7e434c293c9631025a5a725d62838fa12e845838 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel {
     int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
     int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
 
-    VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
-            << " x_num_col_dims=" << x_num_col_dims
-            << " y_num_col_dims=" << y_num_col_dims;
+    VLOG(30) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
+             << " x_num_col_dims=" << x_num_col_dims
+             << " y_num_col_dims=" << y_num_col_dims;
 
     PADDLE_ENFORCE_GT(
         x_dims.size(), x_num_col_dims,
@@ -56,7 +56,8 @@ class MulOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0],
                       "First matrix's width must be equal with second matrix's "
-                      "height. %s, %s");
+                      "height. %s, %s",
+                      x_mat_dims[1], y_mat_dims[0]);
     std::vector<int64_t> output_dims;
     output_dims.reserve(
         static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
@@ -126,6 +127,14 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };
 
+class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:  // overrides a virtual declared by the base class
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // pairs input "X" with output "Out"
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -178,7 +187,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
+                  ops::MulOpGradMaker);
 REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 8de974bc2b333fb6ccc5b5f0bb1af86533139925..9db0031a6934537a7d991b775ecac688ae6b66e9 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     // device id
     int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(gpu_id);
-    VLOG(3) << "gpu : "
-            << " invoke allreduce. send " << x->numel() << " recv "
-            << out->numel();
+    VLOG(30) << "gpu : "
+             << " invoke allreduce. send " << x->numel() << " recv "
+             << out->numel();
     PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
         x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
         NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : "
-            << " finished allreduce. send " << x->numel() << " recv "
-            << out->numel();
+    VLOG(30) << "gpu : "
+             << " finished allreduce. send " << x->numel() << " recv "
+             << out->numel();
   }
 };
 
@@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     } else {
       out->Resize(framework::make_ddim({0}));
     }
-    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
-            << " recv " << out->numel();
+    VLOG(30) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
+             << " recv " << out->numel();
     PADDLE_ENFORCE(platform::dynload::ncclReduce(
         x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
         reduction_op_, root, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
-            << " recv " << out->numel();
+    VLOG(30) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
+             << " recv " << out->numel();
   }
 };
 
@@ -133,21 +133,22 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     int idx = comm->GetCommId(gpu_id);
     if (idx == root) {
       auto* x = ctx.Input<LoDTensor>("X");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
+      VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
       PADDLE_ENFORCE(platform::dynload::ncclBcast(
           reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
           NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
           ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
+      VLOG(30) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
       auto* out = ctx.Output<LoDTensor>("Out");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
-              << framework::product(out->dims());
+      VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+               << framework::product(out->dims());
       PADDLE_ENFORCE(platform::dynload::ncclBcast(
           out->mutable_data<T>(ctx.GetPlace()), out->numel(),
           NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
           ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
+      VLOG(30) << "gpu : " << gpu_id << " finished Bcast. recv "
+               << out->numel();
     }
   }
 };
diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index d5fb7a12e5d9757f3e639f6de7f0129bd531e2a1..f48ccdd97fa5adb475013cf26e7544c2729b4457 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test {
     (*p_scopes).resize(gpu_list_.size());
 
     auto op = f::OpRegistry::CreateOp(*op1);
-    VLOG(1) << "invoke NCCLInitOp.";
+    VLOG(10) << "invoke NCCLInitOp.";
     op->Run(g_scope_, cpu_place);
-    VLOG(1) << "NCCLInitOp finished.";
+    VLOG(10) << "NCCLInitOp finished.";
   }
 
   int GetGPUData(int gpu_id) { return gpu_id + 42; }
@@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test {
 
       std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
       paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
-      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
+      VLOG(10) << "Send Tensor filled with elements " << send_tensor->numel();
     }
 
     lk.unlock();
@@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test {
 
     auto op = f::OpRegistry::CreateOp(*op1);
 
-    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
-    VLOG(1) << " send_tensor : " << send_tensor->numel()
-            << " recv_tensor : " << recv_tensor->numel();
+    VLOG(10) << "Device : " << gpu_id << " invoke " << op_desc.Type();
+    VLOG(10) << " send_tensor : " << send_tensor->numel()
+             << " recv_tensor : " << recv_tensor->numel();
     op->Run(*scope, place);
-    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
+    VLOG(10) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
 
  public:
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index e471f04662a1fa3e8e77a2db37f0da4521682018..877c9a0528441a7d5b1306c3f8f8be1a5aea577a 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -69,7 +69,7 @@ class NCEOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.GetPlace());
+        platform::CPUPlace());
   }
 };
 
@@ -174,7 +174,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.GetPlace());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index ab25628d45699dbcfc1fc5792958bae9e42e72a3..c795d4bdd10c0ffbf30a4849fc773335036e34c2 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes(
     auto lod_tensors = tensor.SplitLoDTensor(places);
 
     for (auto &lod : lod_tensors) {
-      VLOG(3) << lod.dims();
+      VLOG(30) << lod.dims();
     }
     if (num_sub_scopes == 0) {
       num_sub_scopes = lod_tensors.size();
@@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
       if (s == framework::kEmptyVarName) {
         continue;
       }
-      VLOG(3) << "Moving " << s;
+      VLOG(30) << "Moving " << s;
       CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
     }
     WaitOnPlaces(places);
@@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
       if (s == framework::kEmptyVarName) {
         continue;
       }
-      VLOG(3) << "Accumulating " << s;
+      VLOG(30) << "Accumulating " << s;
       if (s == framework::kEmptyVarName) continue;
       std::string tmp_name;
       auto *tmp = sub_scopes[0]->Var(&tmp_name);
@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{{"use_mkldnn", {false}}});
-        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
+        VLOG(100) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
       }
@@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
     auto *grad = new framework::OpDesc();
     grad->SetType("parallel_do_grad");
     for (auto &input_param : this->InputNames()) {
-      VLOG(3) << input_param;
+      VLOG(30) << input_param;
       grad->SetInput(input_param, this->Input(input_param));
       if (input_param != kPlaces) {
         grad->SetOutput(framework::GradVarName(input_param),
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 484cb65746612343fafc49fe61b607f2e919cf4f..46a95350a7293c18313811ba9b367fd65955145a 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
   return output_size;
 }
 
-void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Out"),
                  "Out(Output) of Pooling should not be null.");
@@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 }
 
 framework::OpKernelType PoolOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
       layout_, library_);
 }
 
-void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
   PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                  "Input(X@GRAD) should not be null.");
@@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 }
 
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -262,6 +262,14 @@ Example:
 )DOC");
 }
 
+class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:  // overrides a virtual declared by the base class
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // pairs input "X" with output "Out"
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 void Pool3dOpMaker::Make() {
   AddInput("X",
            "(Tensor) The input tensor of pooling operator. "
@@ -372,6 +380,7 @@ Example:
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  ops::PoolOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
 
@@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL(
     ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  ops::PoolOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
 
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 490dfa41be2de987c51b7f06d988ce27980aa5f2..55853d25460bf6e3d07c829d686e71cc9367118c 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase {
     std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-                << outs[i] << " back";
+        VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                 << outs[i] << " back";
         rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
                                                     ins[i], outs[i]));
       } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+        VLOG(30) << "don't send no-initialied variable: " << ins[i];
       }
     }
     for (size_t i = 0; i < rets.size(); i++) {
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index d68ba9d661698bb0d33b139f5748daec2ead6595..5f1a48b6de01550978638917e3c66ef2851ee2ed 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel<T> {
         seed = *cpu_seed.data<int64_t>();
       }
     } else {
-      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
-                 "'startup_seed' instead.";
+      VLOG(50) << "WARNING: The input 'Seed' is not initialized, use attribute "
+                  "'startup_seed' instead.";
       seed = ctx.Attr<int>("startup_seed");
     }
     auto shape = ctx.Attr<std::vector<int>>("shape");
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 51b980acb5a08d431d96a3a92479dec09119c27e..618248f87298d62078aeccfa135b853b9d2b1744 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -42,7 +42,7 @@ class BlockingQueue {
     std::unique_lock<std::mutex> lock(mutex_);
     send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
     if (closed_) {
-      VLOG(5)
+      VLOG(50)
           << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
       return false;
     }
@@ -56,7 +56,7 @@ class BlockingQueue {
     std::unique_lock<std::mutex> lock(mutex_);
     send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
     if (closed_) {
-      VLOG(5)
+      VLOG(50)
           << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
       return false;
     }
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 3f72890a7cee1453585d50afa04fa62a9b059dc3..3fe4e9e7adee071fd56cf9f3d2560829f096ba9b 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader {
   ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
                 size_t seed = 0)
       : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
-    VLOG(10) << "Create shuffle reader of " << reader_;
+    VLOG(100) << "Create shuffle reader of " << reader_;
     if (seed_ == 0) {
       std::random_device device;
       seed_ = device();
@@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader {
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     out->clear();
     if (iteration_pos_ >= buffer_.size()) {
-      VLOG(10) << "Resetting shuffle buffer";
+      VLOG(100) << "Resetting shuffle buffer";
       ReloadBuffer();
       if (buffer_.empty()) {
         return;
@@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader {
     std::mt19937 g(seed_);
     std::shuffle(buffer_.begin(), buffer_.end(), g);
     seed_ = g();  // update seed_;
-    VLOG(10) << "random buffer size = " << buffer_.size();
+    VLOG(100) << "random buffer size = " << buffer_.size();
   }
 
   size_t buffer_size_;
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b..283dce93212ac91fc4a3276598c1f32cfd36d1e7 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase {
                                      Callback callback) {
     PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
     for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
       AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
     }
   }
@@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase {
                                      Callback callback) {
     PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
     for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
       AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
     }
   }
@@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
-    VLOG(3) << "Static RNN input sequence length = " << seq_len;
+    VLOG(30) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
@@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase {
 
     for (size_t i = 0; i < seq_len; ++i) {
       size_t seq_offset = reverse ? seq_len - i - 1 : i;
-      VLOG(3) << "Recurrent operate at the time step " << seq_offset;
+      VLOG(30) << "Recurrent operate at the time step " << seq_offset;
 
       auto &cur_scope = scopes.CurScope();
 
@@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase {
 
     for (size_t step_id = 0; step_id < seq_len; ++step_id) {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
-      VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
+      VLOG(30) << "Recurrent backward operate at the time step " << seq_offset;
       auto &cur_scope = scopes.CurScope();
       // Link outside::output_grads --> inside::output_grads
       //   inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
@@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase {
           });
       auto og_set = List2Set(Inputs(kOutputGrads));
 
-      if (VLOG_IS_ON(10)) {
+      if (VLOG_IS_ON(100)) {
         std::ostringstream sout;
         std::copy(og_set.begin(), og_set.end(),
                   std::ostream_iterator<std::string>(sout, ","));
-        VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
+        VLOG(100) << " RNN output gradients = [" << sout.str() << "]";
       }
 
       // Link states
@@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto &ex_tensor =
               ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
 
-          VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+          VLOG(100) << " RNN link " << cur_grad << " from " << ex_grad;
           auto *cur_grad_var = cur_scope.Var(cur_grad);
           auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
@@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase {
         }
       }
 
-      VLOG(5) << "Recurrent memory linking finished ";
+      VLOG(50) << "Recurrent memory linking finished ";
       // Run step block with cur_scope
       executor.Run(*program, &cur_scope, block->ID(),
                    false /*create_local_scope*/);
 
-      VLOG(5) << "executor.Run finished ";
+      VLOG(50) << "executor.Run finished ";
 
       auto local_var_names = LocalVarNames(cur_scope);
 
@@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase {
           cur_scope.Rename(new_inside_name, inside_grad_name);
         }
       }
-      VLOG(5) << "Accumulate Parameter finished ";
+      VLOG(50) << "Accumulate Parameter finished ";
 
       // Copy input gradient from inside to outside
       //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
@@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase {
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
             framework::TensorCopy(inside, place, dev_ctx, &dst);
           });
-      VLOG(5) << "Link outside gradient finished ";
+      VLOG(50) << "Link outside gradient finished ";
 
       if (step_id + 1 == seq_len) {  // at_end
         // copy initialize states gradient from inside to outside
@@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase {
               outside->mutable_data(place, inside.type());
               framework::TensorCopy(inside, place, dev_ctx, outside);
             });
-        VLOG(5) << "Link initialize state gradient finished ";
+        VLOG(50) << "Link initialize state gradient finished ";
       }
       scopes.Next();
     }
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 0399ff41007fbe10da8d53a05671eb0cfb475a5f..fbbd86502bfc61c004f88971526195f6a083d5a9 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase {
 
     std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      VLOG(30) << "getting " << outs[i] << " from " << epmap[i];
       rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
     }
     if (sync_mode) {
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
index 0d86b3127e42f7ee14ba57b1c762e8128a0f2d54..b21da178f3eeaafa41bde5f64cc4abcf7944b032 100644
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max,
                                           int, ops::MaxFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::MaxFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6954c8d744faee6f8f0b715d6e4c8e3bcda7fb83
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(  // CUDA grad kernels: float, double, int, int64_t
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
index 59b30244839849d79e3e531953134633503c4090..4408200d2d052c2f68c2dd35619de6ed67f07f6e 100644
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
                         ops::ReduceMeanKernel<double>,
                         ops::ReduceMeanKernel<int>,
                         ops::ReduceMeanKernel<int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4b663bcdca7c20f8802d962a362f429d8eafe9af
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.part.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// .part used to speed up nvcc compile
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(  // CUDA grad kernels: float, double, int, int64_t
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
index da466f805eff4709dc23471baef03e94052ee6c1..5a04a12b79444dcea30d3c1140d9708a98b55fe3 100644
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min,
                                           int, ops::MinFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::MinFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5b8f061b2d03eb76863401905ac87044fd5ea778
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(  // CUDA grad kernels: float, double, int, int64_t
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
index d62e677d92cffecf629d1684026b0c7bcfec29e3..d8692afb96e4d5d3206210060684dd12fb4d79a7 100644
--- a/paddle/fluid/operators/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                           int, ops::ProdFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::ProdFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..486c578c64b9a2d80abc940a7c4266ef5fd23c7f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(  // CUDA grad kernels: float, double, int, int64_t
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..2b031e8df99768c9208146640bddbe51149b2614 100644
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel<T> {
 REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
                         ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
                         ops::ReduceSumKernel<int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..525633f62a95b2d0d677fcbebe551b75cb2a180d
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.part.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/cub_reduce.h"
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(  // CUDA grad kernels: float, double, int, int64_t
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 0fb7776fd9dbf437673820c7cf9411644272626c..b840e690960cf77a37895f5b3d83c4cdbc2fca35 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                    in_grad_var_name);
 
     if (out_grad_var == nullptr) {
-      VLOG(5) << "Using fill constant 0 as starting gradient";
+      VLOG(50) << "Using fill constant 0 as starting gradient";
       auto in_var_name = Input("X");
       auto *in_var = scope.FindVar(in_var_name);
       auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index e79cffcf498c52ed14db235f6221cfdf08399c9d..0dcf3f0e372f07370078553465973edfd7c96e07 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase {
         lt_var != nullptr,
         "Can not find variable kLookupTablePath for SaveSelectedRows");
     std::string filename = lt_var->data();
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
+    VLOG(40) << "SaveSelectedRows get File name: " << filename;
 
     MkDirRecursively(DirName(filename).c_str());
 
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index d8a199bc2b860515645b4954b49d8eb59fbd02dc..96b8b00b429df72569ef2a292c8a600c56159f19 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -24,19 +24,13 @@ class ScaleKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& ctx) const {
     auto* in_var = ctx.InputVar("X");
-    auto* in = ctx.Input<framework::Tensor>("X");
-
-    auto* out_var = ctx.OutputVar("Out");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(in->place());
-
-    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      "in and out should have the same dim");
+    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
 
     auto scale = static_cast<T>(ctx.Attr<float>("scale"));
     auto bias = static_cast<T>(ctx.Attr<float>("bias"));
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
 
+    auto* out_var = ctx.OutputVar("Out");
     if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
       auto& in_slr = in_var->Get<framework::SelectedRows>();
       auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
@@ -44,6 +38,13 @@ class ScaleKernel : public framework::OpKernel<T> {
       out_slr->set_height(in_slr.height());
     }
 
+    auto* out =
+        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
+    out->mutable_data<T>(in->place());
+
+    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
+                      "in and out should have the same dim");
+
     auto eigen_out = framework::EigenVector<T>::Flatten(*out);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
     auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h
index ac7d69bfb549fd98c76fcf834e8d3ad9bec2ef23..b2e79f6c82bb748293f4219845e6798347c8c46e 100644
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
@@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
                       const Tensor& index, Tensor* output) {
   // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index.dims().size() == 1);
+  PADDLE_ENFORCE(index.dims().size() == 1 ||
+                 (index.dims().size() == 2 && index.dims()[1] == 1));
   int index_size = index.dims()[0];
 
   auto src_dims = src.dims();
diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h
index 39af717615c01f5c121e32b176b74d05be738531..8bae6606c94620ab4fa8ae34f69236e7e87e9670 100644
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
@@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
                    const Tensor& index, Tensor* output) {
   PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index.dims().size() == 1);
+  PADDLE_ENFORCE(index.dims().size() == 1 ||
+                 (index.dims().size() == 2 && index.dims()[1] == 1));
   int index_size = index.dims()[0];
 
   auto src_dims = src.dims();
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 8ca2877d8adad643089587fcee0917affa537f7d..02ca107ca35348df1827805e40730acd39f39e87 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase {
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(
             Attr<int>("trainer_id"));
 
-    VLOG(3) << "SendBarrierOp sync";
+    VLOG(30) << "SendBarrierOp sync";
 
     // need to wait before sending send_barrier message
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     for (auto& ep : eps) {
-      VLOG(3) << "send barrier, ep: " << ep;
+      VLOG(30) << "send barrier, ep: " << ep;
       rpc_client->AsyncSendBatchBarrier(ep);
     }
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index be1dc4bf14c3394963822b065ca088afbfacd858..0ad43d56d3cd7500290dc1e386a2dbaf4453a191 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase {
     std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(30) << "sending " << ins[i] << " to " << epmap[i];
         rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
       } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+        VLOG(30) << "don't send no-initialied variable: " << ins[i];
       }
     }
     if (sync_send) {
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index aee6180add5708d31f7ce927b37c4524a291fe3c..d79b16e3cca714d44c88834082cea9367480da9a 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
 void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
   f::Scope scope;
   p::CPUPlace place;
-  VLOG(4) << "before init tensor";
+  VLOG(40) << "before init tensor";
   if (is_sparse) {
     InitSelectedRowsInScope(place, &scope);
   } else {
@@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
   attrs.insert({"PrefetchBlock", prefetch_block});
   attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
   attrs.insert({"sync_mode", true});
-  VLOG(4) << "before init op";
+  VLOG(40) << "before init op";
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   *initialized = true;
diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h
index 18acb735cecabd1e01f7821c880fd8ed5e52971f..7ff68f9c715e4c7243afe9de84af9474e7e4e260 100644
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
@@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
     auto x_numel = x->numel();
     if (maxlen < 0) {
 #ifdef __NVCC__
-      VLOG(10)
+      VLOG(100)
           << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
       maxlen = static_cast<int>(
           thrust::reduce(thrust::device_pointer_cast(x_data),
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index d8b0165b2a89b04bd55671a37d96ee4ba275b2eb..b27ef27e298d0f08129e2c0a349c741129acdfe2 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel<T> {
 
       auto param_row_width = param.value().dims()[1];
       auto grad_row_width = grad.value().dims()[1];
-      VLOG(4) << " param rows: " << param.rows().size()
-              << " param memory rows: " << param.value().dims()[0]
-              << " grad rows: " << grad.rows().size()
-              << " grad memory rows: " << grad.value().dims()[0];
+      VLOG(40) << " param rows: " << param.rows().size()
+               << " param memory rows: " << param.value().dims()[0]
+               << " grad rows: " << grad.rows().size()
+               << " grad memory rows: " << grad.value().dims()[0];
       PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
                         "param_row should have the same size with grad_row");
 
@@ -109,8 +109,6 @@ class SGDOpKernel : public framework::OpKernel<T> {
       const auto *grad_data = grad.value().data<T>();
       auto *out_data = param_out->mutable_value()->data<T>();
       for (size_t i = 0; i < grad.rows().size(); i++) {
-        PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
-                       "Input rows index should less than height");
         int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
         PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                           "id should be in the table");
diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9612f82b6d45dc4e08bfe288ddd1c7790875ee4d
--- /dev/null
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/similarity_focus_op.h"
+
+namespace paddle {
+namespace operators {
+// Declares similarity_focus's input X, output Out and attrs axis/indexes.
+class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor, default Tensor<float>), a 4-D tensor with shape,"
+                  " [BatchSize, X, Y, Z]");
+    AddOutput("Out", "(Tensor, default Tensor<float>), the similarity focus"
+                     " mask with the same shape of input X.");
+    // The kernel reads dim[axis] with only dim[1..3] filled, so enforce that.
+    AddAttr<int>("axis", "(int32), indicating the dimension to be select. It"
+                         " can only be 1, 2, or 3.")
+        .InEnum({1, 2, 3});
+    AddAttr<std::vector<int>>("indexes",
+                              "(std::vector<int32>), indicating the indexes"
+                              " of the selected dimension.");
+    AddComment(R"DOC(
+SimilarityFocus Operator.
+
+Generate a similarity focus mask with the same shape of input using the following method:
+1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
+   to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
+   it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
+   is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
+2. For each index, find the largest numbers in the tensor T, so that the same 
+   row and same column has at most one number(what it means is that if the 
+   largest number has been found in the i-th row and the j-th column, then 
+   the numbers in the i-th row or j-th column will be skipped. And then the 
+   next largest number will be selected from the remaining numbers. Obviously 
+   there will be min(B, C) numbers), and mark the corresponding position of the 
+   3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
+   each index.
+3. Broadcast the 3-D similarity focus mask to the same shape of input X.
+
+Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
+)DOC");
+  }
+};
+
+class SimilarityFocusOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires a rank-4 input X and gives Out the same dims and LoD as X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+    const auto input_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(input_dims.size(), 4, "Input(X)'s rank should be 4.");
+    ctx->SetOutputDim("Out", input_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  // Kernel type: data type taken from X, place fixed to CPUPlace.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto dtype = framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(dtype, platform::CPUPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias for the registrations
+REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp,
+                  ops::SimilarityFocusOpMaker,
+                  paddle::framework::EmptyGradOpMaker);  // assumed: no grad op
+REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel<float>,
+                       ops::SimilarityFocusKernel<double>);
diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf3fed2aaf2cf92d5619ae5bce6dd70d9dfe9621
--- /dev/null
+++ b/paddle/fluid/operators/similarity_focus_op.h
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+// CPU kernel of the similarity_focus operator.
+//
+// X is read as a contiguous 4-D tensor [N, dim1, dim2, dim3]. For every batch
+// item and every entry of the `indexes` attribute, the 2-D slice obtained by
+// fixing dimension `axis` to that index is visited from its largest value
+// downwards. A cell is selected when neither its row nor its column has been
+// selected before, until min(rows, cols) cells are chosen. Each selected
+// (row, col) position is set to 1 in Out along the whole `axis` dimension;
+// every other element of Out is 0. Results of several indexes are OR-ed
+// together because 1s are only ever written, never cleared.
+//
+// Throws (PADDLE_THROW) when `indexes` is empty, when `axis` is not 1, 2 or 3,
+// or when an index lies outside [0, dim[axis]).
+template <typename T>
+class SimilarityFocusKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    Tensor* out = context.Output<Tensor>("Out");
+    const Tensor* x = context.Input<Tensor>("X");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+
+    const int axis = context.Attr<int>("axis");
+    const std::vector<int> indexes =
+        context.Attr<std::vector<int>>("indexes");
+
+    if (indexes.size() < 1) {
+      PADDLE_THROW("Indexes' size can not be 0.");
+    }
+    // Validate axis BEFORE it is used to subscript dim[]: slot 0 is never
+    // filled from the tensor and anything outside [1, 3] is out of bounds.
+    if (axis < 1 || axis > 3) {
+      PADDLE_THROW("Axis must be 1 or 2 or 3");
+    }
+
+    const int64_t batch_size = x->dims()[0];
+    int64_t dim[4] = {0, 0, 0, 0};
+    for (int i = 1; i <= 3; ++i) {
+      dim[i] = x->dims()[i];
+    }
+
+    // Valid positions along `axis` are [0, dim[axis]); index == dim[axis]
+    // and negative indexes would read outside the tensor.
+    for (auto index : indexes) {
+      if (index < 0 || index >= dim[axis]) {
+        PADDLE_THROW("Index exceeds tensor shape limit.");
+      }
+    }
+
+    // The two non-batch axes other than `axis`, in increasing order. They
+    // span the 2-D slice that is searched for maxima.
+    const int row_axis = (axis == 1) ? 2 : 1;
+    const int col_axis = (axis == 3) ? 2 : 3;
+    const int64_t rows = dim[row_axis];
+    const int64_t cols = dim[col_axis];
+    const int64_t focus_len = dim[axis];
+    const int64_t max_picks = std::min(rows, cols);
+
+    // Element strides of a contiguous [N, dim1, dim2, dim3] layout;
+    // stride[0] is the size of one batch item.
+    const int64_t stride[4] = {dim[1] * dim[2] * dim[3], dim[2] * dim[3],
+                               dim[3], 1};
+
+    // (value, flattened row * cols + col position) for one slice.
+    std::vector<std::pair<T, int64_t>> cells(rows * cols);
+    auto greater_value = [](const std::pair<T, int64_t>& a,
+                            const std::pair<T, int64_t>& b) {
+      return a.first > b.first;
+    };
+
+    std::memset(out_data, 0, sizeof(T) * batch_size * stride[0]);
+    for (int64_t n = 0; n < batch_size; ++n) {
+      const int64_t batch_base = n * stride[0];
+      for (auto index : indexes) {
+        // Gather the slice X[n, ..., axis = index, ...].
+        const int64_t slice_base = batch_base + index * stride[axis];
+        for (int64_t r = 0; r < rows; ++r) {
+          for (int64_t c = 0; c < cols; ++c) {
+            const int64_t pos = r * cols + c;
+            cells[pos] = std::make_pair(
+                x_data[slice_base + r * stride[row_axis] +
+                       c * stride[col_axis]],
+                pos);
+          }
+        }
+        std::sort(cells.begin(), cells.end(), greater_value);
+
+        // Greedy selection: at most one pick per row and per column.
+        std::vector<bool> row_used(rows), col_used(cols);
+        int64_t picked = 0;
+        for (const auto& cell : cells) {
+          const int64_t r = cell.second / cols;
+          const int64_t c = cell.second % cols;
+          if (row_used[r] || col_used[c]) {
+            continue;
+          }
+          row_used[r] = true;
+          col_used[c] = true;
+          // Mark the selected (r, c) position for every entry along `axis`.
+          const int64_t target =
+              batch_base + r * stride[row_axis] + c * stride[col_axis];
+          for (int64_t t = 0; t < focus_len; ++t) {
+            out_data[target + t * stride[axis]] = static_cast<T>(1);
+          }
+          if (++picked == max_picks) {
+            break;
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index a4bdbe6648afa7c91a056af4737bb5d826229022..9e21b6c824bfd7d1c1090e5ba3ba2f6aa9bdb230 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have:
   }
 };
 
+// Declares the {X -> Out} input/output pair for PassInDtypeAndVarTypeToOutput
+// (presumably the base class then gives Out the dtype and variable type of X
+// -- confirm against the base class).
+class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    std::unordered_map<std::string, std::string> same_type;
+    same_type.emplace("X", /*->*/ "Out");
+    return same_type;
+  }
+};
+
 class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  ops::SoftmaxOpGradMaker);
+                  ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f109dd685c87ab1b0776a855bb5f510eab1f5526
--- /dev/null
+++ b/paddle/fluid/operators/space_to_depth_op.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+// Forward operator definition of space_to_depth.
+//
+// InferShape requires X to be a 4-D [B, C, H, W] tensor with positive C, H, W,
+// C divisible by blocksize^2, and H and W divisible by blocksize. Out is given
+// the shape [B, C * blocksize^2, H / blocksize, W / blocksize].
+class SpaceToDepthOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SpaceToDepthOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SpaceToDepthOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "input should be a 4D tensor");
+    auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
+
+    PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1");
+    PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
+
+    // Only the channel check is against blocksize^2; height and width are
+    // checked against blocksize itself, and the messages say so.
+    PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
+                      "input channel should be divisible by the square of "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
+                      "input Height should be divisible by "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
+                      "input Width should be divisible by "
+                      "SpaceToDepthOp blocksize");
+
+    VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
+            << " Attribute blocksize=" << blocksize;
+
+    std::vector<int64_t> output_shape(4, 0);  // [B,C,H,W]
+    output_shape[0] = x_dims[0];
+    output_shape[1] = x_dims[1] * blocksize * blocksize;
+    output_shape[2] = x_dims[2] / blocksize;
+    output_shape[3] = x_dims[3] / blocksize;
+
+    auto out_dims = framework::make_ddim(output_shape);
+
+    ctx->SetOutputDim("Out", out_dims);
+
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+// Proto/attribute declaration of space_to_depth: one input X, one output Out
+// and the int64 attribute `blocksize` (default 2, must be greater than 1).
+class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    // Dimension order follows the operator's InferShape and kernels, which
+    // read dims as [B, C, H, W].
+    AddInput("X",
+             "(Tensor). The input should be a 4D tensor B * C * H * W of "
+             "SpaceToDepthOp "
+             "operator.");
+    AddOutput("Out",
+              "(Tensor), The output should be a 4D tensor B * C2 * H2 * W2 of "
+              "SpaceToDepthOp operator.");
+    AddAttr<int64_t>(
+        "blocksize",
+        "(int64_t, default 2) blocksize used to do change Space To Depth.")
+        .SetDefault(2)
+        .GreaterThan(1);
+    AddComment(R"DOC(
+        reorg operator used in Yolo v2.
+        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, 
+
+        Rearrange Input(X) into the shape according to Attr(blocksize). The
+        values of Input(X) are moved to new positions but are not modified.
+
+        Examples:
+
+            1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X)
+            into a 4-D tensor with shape [128, 8192, 13, 13] (channels multiplied by blocksize * blocksize, height and width divided by blocksize).
+
+    )DOC");
+  }
+};
+
+// Gradient operator of space_to_depth: the gradient of X is given the same
+// dimensions as X itself.
+class SpaceToDepthGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires both X and Out@GRAD as inputs; sets the shape of X@GRAD.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    const auto out_grad_name = framework::GradVarName("Out");
+    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
+                   "Input(Out@GRAD) shouldn't be null.");
+    const auto x_grad_name = framework::GradVarName("X");
+    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Forward op, registered with its proto maker and DefaultGradOpDescMaker<true>
+// (presumably this auto-generates the space_to_depth_grad op description --
+// confirm against the framework).
+REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+// Backward op: shape inference only, no proto maker.
+REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp);
+// CPU kernels for float, double and int64_t (forward).
+REGISTER_OP_CPU_KERNEL(
+    space_to_depth,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, int64_t>);
+// CPU kernels for float, double and int64_t (backward).
+REGISTER_OP_CPU_KERNEL(
+    space_to_depth_grad,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..38d0a662733222386b8ecd68d064f3d1abe56c3b
--- /dev/null
+++ b/paddle/fluid/operators/space_to_depth_op.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+
+// CUDA forward kernels for float, double and int64_t.
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth, ops::SpaceToDepthKernel<plat::CUDADeviceContext, float>,
+    ops::SpaceToDepthKernel<plat::CUDADeviceContext, double>,
+    ops::SpaceToDepthKernel<plat::CUDADeviceContext, int64_t>);
+
+// CUDA backward kernels for float, double and int64_t.
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth_grad,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, float>,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, double>,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a71662b4813ab27b65f5c7a918e2bb6fb15a1993
--- /dev/null
+++ b/paddle/fluid/operators/space_to_depth_op.h
@@ -0,0 +1,127 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// Guard the whole header against multiple inclusion. The previous
+// #ifndef/#define/#endif triple closed before any declaration and so
+// protected nothing.
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+// Per-element functor used by the space_to_depth forward and backward
+// kernels. Given a flat index into a [batch, c, h, w] layout it derives a
+// second flat index (out_index) and copies one element:
+//   forward != 0 : out[out_index] = x[in_index]
+//   forward == 0 : out[in_index]  = x[out_index]
+template <typename T>
+class space_to_depth_compute {
+ public:
+  HOSTDEVICE space_to_depth_compute(const T *x, int64_t w, int64_t h, int64_t c,
+                                    int64_t batch, int64_t blocksize,
+                                    int64_t forward, T *out)
+      : x_(x),
+        w_(w),
+        h_(h),
+        c_(c),
+        batch_(batch),
+        blocksize_(blocksize),
+        forward_(forward),
+        out_(out) {}
+
+  HOSTDEVICE void operator()(int64_t in_index) {
+    const int64_t plane = h_ * w_;      // elements per channel
+    const int64_t volume = c_ * plane;  // elements per batch item
+    const int64_t out_c = c_ / (blocksize_ * blocksize_);
+
+    // Decompose the flat index into (b, k, j, i) = (batch, channel, row, col).
+    const int64_t b = in_index / volume;
+    const int64_t within_batch = in_index % volume;
+    const int64_t k = within_batch / plane;
+    const int64_t within_plane = within_batch % plane;
+    const int64_t j = within_plane / w_;
+    const int64_t i = within_plane % w_;
+
+    // Split the channel into a reduced channel and a block offset, then
+    // spread the offset over the enlarged width/height.
+    const int64_t c2 = k % out_c;
+    const int64_t offset = k / out_c;
+    const int64_t w2 = i * blocksize_ + offset % blocksize_;
+    const int64_t h2 = j * blocksize_ + offset / blocksize_;
+
+    const int64_t wide_w = w_ * blocksize_;
+    const int64_t wide_h = h_ * blocksize_;
+    const int64_t out_index = w2 + wide_w * (h2 + wide_h * (c2 + out_c * b));
+
+    if (forward_) {
+      out_[out_index] = x_[in_index];
+    } else {
+      out_[in_index] = x_[out_index];
+    }
+  }
+
+ private:
+  const T *x_;
+  int64_t w_, h_, c_, batch_, blocksize_, forward_;
+  T *out_;
+};
+
+// Forward kernel of space_to_depth: allocates Out and runs
+// space_to_depth_compute (forward = 1) once per element of X through ForRange.
+template <typename DeviceContext, typename T>
+class SpaceToDepthKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *output = context.Output<framework::LoDTensor>("Out");
+    auto *input = context.Input<framework::LoDTensor>("X");
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto x_dims = input->dims();
+    output->mutable_data(context.GetPlace(), input->type());
+
+    // Remember Out's dims so they can be re-applied after the computation.
+    auto saved_out_dims = output->dims();
+    auto batch = x_dims[0];
+    auto channels = x_dims[1];
+    auto height = x_dims[2];
+    auto width = x_dims[3];
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    platform::ForRange<DeviceContext> for_range(
+        dev_ctx, static_cast<size_t>(input->numel()));
+
+    auto *src = input->data<T>();
+    auto *dst = output->data<T>();
+    paddle::operators::space_to_depth_compute<T> functor(
+        src, width, height, channels, batch, blocksize, 1, dst);
+    for_range(functor);
+
+    output->Resize(saved_out_dims);
+  }
+};
+
+// Backward kernel of space_to_depth: allocates X@GRAD and runs
+// space_to_depth_compute (forward = 0) once per element of X@GRAD, reading
+// from Out@GRAD.
+template <typename DeviceContext, typename T>
+class SpaceToDepthGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *out_grad =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto *x_grad =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto x_grad_dims = x_grad->dims();
+    x_grad->mutable_data(context.GetPlace(), out_grad->type());
+
+    auto batch = x_grad_dims[0];
+    auto channels = x_grad_dims[1];
+    auto height = x_grad_dims[2];
+    auto width = x_grad_dims[3];
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    platform::ForRange<DeviceContext> for_range(
+        dev_ctx, static_cast<size_t>(x_grad->numel()));
+
+    auto *dst = x_grad->data<T>();
+    auto *src = out_grad->data<T>();
+
+    paddle::operators::space_to_depth_compute<T> functor(
+        src, width, height, channels, batch, blocksize, 0, dst);
+    for_range(functor);
+
+    x_grad->Resize(x_grad_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
index fedd7218dd6cc9481e94a92a3820cafbe4157bd0..3b7ae6fc91e0a9e08406e38b9a557cab442c2560 100644
--- a/paddle/fluid/operators/split_byref_op.h
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs.size(); ++i) {
       // NOTE: no need to call mutable_data here to allocate memory.
       auto* out = outs[i];
-      VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
+      VLOG(30) << "spliting by ref: " << row_offset << " " << out->dims()[0];
       *out = in->Slice(row_offset, row_offset + out->dims()[0]);
       row_offset += out->dims()[0];
     }
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index 243f81e296fb95a2c7e9f717950b8a958ad98852..01d432e13068f7b718d08dc15d8cc99a7fbb0afe 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -64,8 +64,7 @@ class SplitIdsOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.MultiInput<framework::Tensor>("Ids").front()->type()),
+        framework::GetDataTypeOfVar(ctx.MultiInputVar("Ids").front()),
         ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index 69ac6c5a6b9a8b318520eb9a3ff89a3a6be48339..6dbada3da8826f0e7cb07a9642d327e5ee38c309 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < ids_tensors.size(); ++i) {
         batch_size += ids_tensors[i]->dims()[0];
       }
-      VLOG(4) << "Get Total BatchSize is: " << batch_size;
+      VLOG(40) << "Get Total BatchSize is: " << batch_size;
 
       std::vector<T> all_ids(batch_size);
       int offset = 0;
@@ -113,6 +113,10 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
                  row_width * sizeof(T));
         }
       }
+    } else {
+      // Two arguments are supplied, so the format needs two %s placeholders:
+      // the input variable name first, then the received type name.
+      PADDLE_THROW(
+          "%s should be LoDTensor or SelectedRows, but the received type is %s",
+          ctx.Inputs("Ids")[0], ids_var->Type().name());
     }
   }
 };
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
index 3f4b48bc7391def082c82ed451fc5a752009a2f1..9345b495415d203728238c19621a20f446c40bf5 100644
--- a/paddle/fluid/operators/stack_op.cc
+++ b/paddle/fluid/operators/stack_op.cc
@@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
 REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
 
 REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
-                       ops::StackKernel<plat::CPUDeviceContext, double>);
+                       ops::StackKernel<plat::CPUDeviceContext, double>,
+                       ops::StackKernel<plat::CPUDeviceContext, int>,
+                       ops::StackKernel<plat::CPUDeviceContext, int64_t>);
 
 REGISTER_OP_CPU_KERNEL(stack_grad,
                        ops::StackGradKernel<plat::CPUDeviceContext, float>,
-                       ops::StackGradKernel<plat::CPUDeviceContext, double>);
+                       ops::StackGradKernel<plat::CPUDeviceContext, double>,
+                       ops::StackGradKernel<plat::CPUDeviceContext, int>,
+                       ops::StackGradKernel<plat::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu
index 92c1bde2bcf089e5c715e90e564408e6ad37ba17..bf2a9e5b3d22996e688621727cb280dc9aed7859 100644
--- a/paddle/fluid/operators/stack_op.cu
+++ b/paddle/fluid/operators/stack_op.cu
@@ -18,8 +18,12 @@ namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
-                        ops::StackKernel<plat::CUDADeviceContext, double>);
+                        ops::StackKernel<plat::CUDADeviceContext, double>,
+                        ops::StackKernel<plat::CUDADeviceContext, int>,
+                        ops::StackKernel<plat::CUDADeviceContext, int64_t>);
 
 REGISTER_OP_CUDA_KERNEL(stack_grad,
                         ops::StackGradKernel<plat::CUDADeviceContext, float>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, double>);
+                        ops::StackGradKernel<plat::CUDADeviceContext, double>,
+                        ops::StackGradKernel<plat::CUDADeviceContext, int>,
+                        ops::StackGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc
index f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce..2ae5c17bf6465874572e80da54e40fbe22403660 100644
--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
@@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       if (in_dim.empty()) {
-        VLOG(3) << "WARNING: all the inputs are empty";
+        VLOG(30) << "WARNING: all the inputs are empty";
         in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
       } else {
         in_dim[0] = static_cast<int64_t>(first_dim);
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index d19ac9839c90a116265b761e3b1b3f855e2d95e8..c67b694283cd8f0203021c0329f5ac16ae7854a5 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel {
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
     if (N == 1) {
-      VLOG(3) << "Warning: sum have only one input, may waste memory";
+      VLOG(30) << "Warning: sum have only one input, may waste memory";
     }
 
     framework::DDim in_dim({0});
@@ -85,8 +85,8 @@ class SumOp : public framework::OperatorWithKernel {
       for (size_t idx = 0; idx < x_vars.size(); ++idx) {
         PADDLE_ENFORCE(x_vars[idx] != nullptr,
                        "Input var[%s] should not be nullptr", x_vars_name[idx]);
-        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
-        auto tensor = framework::GetTensorFromVar(*x_vars[idx]);
+        auto tensor =
+            framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]);
         if (tensor->numel() == 0) {
           continue;
         }
@@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     auto& inputs = op_desc.Input("X");
     auto var_type = framework::proto::VarType::SELECTED_ROWS;
     for (auto& name : op_desc.Input("X")) {
-      VLOG(10) << name << " "
-               << block->FindRecursiveOrCreateVar(name).GetType();
+      VLOG(100) << name << " "
+                << block->FindRecursiveOrCreateVar(name).GetType();
     }
 
     bool any_input_is_lod_tensor = std::any_of(
diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
index a2d44284e9de1ace42cabbce82e0b45929432d7b..484160aeb8de573c6a6c1bb2ea5da23600d2d287 100644
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp {
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
     if (offset >= out->size()) {
-      VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
-               << " to " << offset + 1;
+      VLOG(100) << "Resize " << Output("Out") << " from " << out->size()
+                << " to " << offset + 1;
       out->resize(offset + 1);
     }
     auto *out_tensor = &out->at(offset);
@@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp {
 
       TensorCopy(x_tensor, place, dev_ctx, out_tensor);
     } else {
-      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
-                  "nothing has been written to output array["
-               << offset << "].";
+      VLOG(100) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                   "nothing has been written to output array["
+                << offset << "].";
     }
   }
 };
@@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
                   framework::BlockDesc *block) const override {
     auto x_name = op_desc.Input("X")[0];
     auto out_name = op_desc.Output("Out")[0];
-    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
+    VLOG(100) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
     auto &out = block->FindRecursiveOrCreateVar(out_name);
     out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
     auto *x = block->FindVarRecursive(x_name);
@@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp {
       framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
     } else {
-      VLOG(10) << "offset " << offset << " >= " << x_array.size();
+      VLOG(100) << "offset " << offset << " >= " << x_array.size();
     }
   }
 };
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96dc123f6a36e1a2b6ae04e0d97dffe1e10ac4ea
--- /dev/null
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -0,0 +1,246 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+// For each item i of the named LoDTensorArray, makes the scope variable
+// `base_name + i` share the item's data and appends that name to res_names.
+void LodTensorArray2LodTensorVector(const framework::Scope &scope,
+                                    const std::string &base_name,
+                                    const std::string &lod_tensor_array_name,
+                                    std::vector<std::string> *res_names) {
+  auto *array_var = scope.FindVar(lod_tensor_array_name);
+  const auto &items = array_var->Get<framework::LoDTensorArray>();
+  auto &rw_scope = const_cast<framework::Scope &>(scope);
+  for (size_t idx = 0; idx < items.size(); ++idx) {
+    const std::string item_name = base_name + std::to_string(idx);
+    auto *item = rw_scope.Var(item_name)->GetMutable<framework::LoDTensor>();
+    item->ShareDataWith(items[idx]);
+    res_names->push_back(item_name);
+  }
+}
+
+// For each item i of the named LoDTensorArray, resizes the LoDTensor held by
+// scope variable `base_name + i` to the item's dims and appends that name to
+// res_names. Only dims are set here; no data is shared or copied.
+void LodTensorVectorResizeFromLodTensorArray(
+    const framework::Scope &scope, const std::string &base_name,
+    const std::string &lod_tensor_array_name,
+    std::vector<std::string> *res_names) {
+  auto *array_var = scope.FindVar(lod_tensor_array_name);
+  const auto &items = array_var->Get<framework::LoDTensorArray>();
+  auto &rw_scope = const_cast<framework::Scope &>(scope);
+  for (size_t idx = 0; idx < items.size(); ++idx) {
+    const std::string item_name = base_name + std::to_string(idx);
+    auto *item = rw_scope.Var(item_name)->GetMutable<framework::LoDTensor>();
+    item->Resize(items[idx].dims());
+    res_names->push_back(item_name);
+  }
+}
+
+// Appends to the output LoDTensorArray one LoDTensor per item of the input
+// array; entry i is pushed back from scope variable `<output name> + i`.
+void LodTensorArrayCreateFromLodTensorArray(
+    const framework::Scope &scope,
+    const std::string &input_lod_tensor_array_name,
+    const std::string &output_lod_tensor_array_name) {
+  const auto &src_array = scope.FindVar(input_lod_tensor_array_name)
+                              ->Get<framework::LoDTensorArray>();
+  auto *dst_array = scope.FindVar(output_lod_tensor_array_name)
+                        ->GetMutable<framework::LoDTensorArray>();
+  auto &rw_scope = const_cast<framework::Scope &>(scope);
+  // NOTE(review): dst_array is not cleared first -- confirm that is intended.
+  for (size_t idx = 0; idx < src_array.size(); ++idx) {
+    std::string var_name = output_lod_tensor_array_name + std::to_string(idx);
+    auto *slot = rw_scope.Var(var_name)->GetMutable<framework::LoDTensor>();
+    dst_array->push_back(*slot);
+  }
+}
+
+// Concatenates all items of the input LoDTensorArray X along `axis` into Out
+// by delegating to the registered "concat" operator, and writes each item's
+// extent along `axis` into OutIndex (int, on CPU).
+class LoDTensorArray2TensorOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto axis = Attr<int>("axis");
+
+    framework::AttributeMap attrs;
+    attrs["axis"] = axis;
+
+    auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto &out_inx =
+        *scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
+
+    const size_t n = inx.size();
+    PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
+
+    std::string base_name = Inputs("X")[0];
+    std::vector<std::string> names;
+
+    // OutIndex keeps its current dims except dim 0, which becomes the number
+    // of array items.
+    auto out_inx_dim = out_inx.dims();
+    out_inx_dim[0] = inx.size();
+    out_inx.Resize(out_inx_dim);
+
+    // Fill OutIndex in place. Staging the values in a scope variable with a
+    // fixed name ("out_index") and sharing its buffer would let every
+    // instance of this op in the same scope alias the same tensor.
+    int *out_index_data = out_inx.mutable_data<int>(platform::CPUPlace());
+
+    auto out_dims = inx[0].dims();
+    size_t out_dim_sum = 0;
+    for (size_t index = 0; index < inx.size(); index++) {
+      auto inx_dims = inx[index].dims();
+      out_dim_sum += inx_dims[axis];
+      out_index_data[index] = inx_dims[axis];
+    }
+
+    // Out has the dims of item 0 with `axis` replaced by the summed extent.
+    out_dims[axis] = out_dim_sum;
+    out.Resize(out_dims);
+
+    // Expose every array item as a scope variable so concat can consume them.
+    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
+    // Invoke the concat op on the exposed items.
+    auto concat_op = framework::OpRegistry::CreateOp(
+        "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
+
+    concat_op->Run(scope, place);
+  }
+};
+
+// Proto maker: input X, outputs Out and OutIndex, int attr axis (default 0).
+class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator.");
+    AddOutput("Out", "Output tensor of tensor_array_to_tensor operator.");
+    AddOutput("OutIndex",
+              "The size along axis of each item of the input LoDTensorArray.");
+    AddAttr<int>("axis",
+                 "The axis along which the input tensors will be concatenated.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+tensor_array_to_tensor Operator.
+
+Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
+Examples:
+  Input = {[1,2], [3,4], [5,6]}
+  axis = 0
+  Output = [[1,2],
+            [3,4],
+            [5,6]]
+  OutIndex = [1,1,1]
+
+)DOC");
+  }
+};
+
+class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};  // empty operator(): no shapes are inferred for the forward op here
+
+class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {}
+};  // empty operator(): no shapes are inferred for the grad op here
+
+// Sets the var type of every GradVarName("X") output to LOD_TENSOR_ARRAY.
+class LoDTensorArray2TensorGradInferVarType
+    : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &name : op_desc.Output(framework::GradVarName("X")))
+      block->Var(name)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  }
+};
+
+// Backward of tensor_array_to_tensor: runs "concat_grad" on the items of X
+// and shares the per-item gradients into the GradVarName("X") output array.
+class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    framework::AttributeMap attrs;
+    attrs["axis"] = Attr<int>("axis");
+
+    const std::string x_name = Input("X");
+    auto &x_array = scope.FindVar(x_name)->Get<framework::LoDTensorArray>();
+    PADDLE_ENFORCE_GT(x_array.size(), 0, "Input tensorarray size should > 0.");
+
+    // Expose the forward inputs as scope variables named <X name> + i.
+    std::vector<std::string> item_names;
+    LodTensorArray2LodTensorVector(scope, Inputs("X")[0], x_name, &item_names);
+
+    const auto dx_name = Output(framework::GradVarName("X"));
+    const auto dout_name = Input(framework::GradVarName("Out"));
+
+    // Size one gradient holder per item; holders are named "grad_name" + i.
+    // NOTE(review): that base name is shared by every instance of this op in
+    // the scope -- confirm that aliasing between instances is acceptable.
+    std::vector<std::string> grad_names;
+    LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", x_name,
+                                            &grad_names);
+
+    auto concat_grad_op = framework::OpRegistry::CreateOp(
+        "concat_grad", {{"X", item_names}, {"Out@GRAD", {dout_name}}},
+        {{"X@GRAD", grad_names}}, attrs);
+
+    concat_grad_op->Run(scope, place);
+
+    // Populate the output array, then point each entry at its holder's data.
+    LodTensorArrayCreateFromLodTensorArray(scope, x_name, dx_name);
+    auto *dx_array =
+        scope.FindVar(dx_name)->GetMutable<framework::LoDTensorArray>();
+    for (size_t idx = 0; idx < grad_names.size(); ++idx) {
+      auto *holder = scope.FindVar(grad_names[idx]);
+      (*dx_array)[idx].ShareDataWith(holder->Get<framework::LoDTensor>());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+USE_OP(concat);  // "concat" is created by name in RunImpl above
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp,
+                  ops::LoDTensorArray2TensorOpMaker,
+                  ops::LoDTensorArray2TensorOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp,
+                  ops::LoDTensorArray2TensorGradInferShape,
+                  ops::LoDTensorArray2TensorGradInferVarType);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 673f86da76ee0712b4d941f5b33594f89926b973..3af9376da1d3fa096b277e6b5a9d1a8de197d6f1 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -34,7 +34,7 @@ namespace operators {
 using FluidDT = framework::proto::VarType_Type;
 using TRT_DT = nvinfer1::DataType;
 
-namespace {
+namespace {  // NOLINT
 
 TRT_DT FluidDataType2TRT(FluidDT type) {
   switch (type) {
@@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
   return nvinfer1::DimsCHW(shape[1], 1, 1);
 }
 
-}  // namespace
+}  // namespace // NOLINT
 
 using inference::Singleton;
 using inference::tensorrt::TRT_EngineManager;
@@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 
     // Convert output tensor from engine to fluid
     int output_index = 0;
-    VLOG(4) << "TensorRT Engine Op Outputs:";
+    VLOG(40) << "TensorRT Engine Op Outputs:";
     for (const auto& y : context.Outputs("Ys")) {
-      VLOG(4) << y;
+      VLOG(40) << y;
       // convert output and copy to fluid.
       nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
@@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 
  protected:
   void Prepare(const framework::ExecutionContext& context) const {
-    VLOG(4) << "Prepare engine";
+    VLOG(40) << "Prepare engine";
     // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
     block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
@@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     engine->InitNetwork();
 
     framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-    VLOG(4) << "parsed var size " << block.AllVars().size();
+    VLOG(40) << "parsed var size " << block.AllVars().size();
     // Add inputs
-    VLOG(4) << "declare inputs";
+    VLOG(40) << "declare inputs";
     for (auto& input : context.Inputs("Xs")) {
       if (parameters.count(input)) continue;
-      VLOG(4) << "declare input " << input;
+      VLOG(40) << "declare input " << input;
       auto* var = block.FindVar(input);
       // TensorRT engine need to create parameters. The parameter's description
       // should be set in
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 3c8a01b6e47459760b05b5ca7fa4fa5e1d37d112..aa6af055decc4856fcf2036d324af6b1ff3a5de0 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -129,15 +129,15 @@ class WhileGradOp : public framework::OperatorBase {
 
     for (auto cur_scope_iter = step_scopes->rbegin();
          cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
-      VLOG(3) << "Start backward at time_step "
-              << cur_scope_iter - step_scopes->rbegin();
+      VLOG(30) << "Start backward at time_step "
+               << cur_scope_iter - step_scopes->rbegin();
       framework::Scope &cur_scope = **cur_scope_iter;
       // Link OG from outside to inside
       for (size_t i = 0; i < outside_og_names.size(); ++i) {
         auto outside_og_name = outside_og_names[i];
         auto inside_og_name = inside_og_names[i];
-        VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
-                << inside_og_name;
+        VLOG(80) << "Linking outside " << outside_og_name << " --> inside "
+                 << inside_og_name;
         if (scope.FindVar(outside_og_name) == nullptr) {
           continue;
         }
@@ -159,11 +159,11 @@ class WhileGradOp : public framework::OperatorBase {
           auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
           auto &inside_array =
               detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
-          VLOG(8) << outside_og_name << " size = " << outside_array.size();
+          VLOG(80) << outside_og_name << " size = " << outside_array.size();
           inside_array.resize(outside_array.size());
 
           for (size_t j = 0; j < inside_array.size(); ++j) {
-            VLOG(8) << j << " " << outside_array[j].numel();
+            VLOG(80) << j << " " << outside_array[j].numel();
             if (outside_array[j].numel() != 0) {
               inside_array[j].set_lod(outside_array[j].lod());
               inside_array[j].ShareDataWith(outside_array[j]);
@@ -289,7 +289,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
     for (auto &each_ig : igs) {
       if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) {
-        VLOG(8) << "Ignore " << each_ig;
+        VLOG(80) << "Ignore " << each_ig;
         each_ig = framework::kEmptyVarName;
       }
     }
@@ -353,8 +353,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
       auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
       auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
       if (g_var != nullptr) {  // Gradient could be @EMPTY@
-        VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-                << " type: " << p_var.GetType();
+        VLOG(50) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
+                 << " type: " << p_var.GetType();
         g_var->SetType(p_var.GetType());
         g_var->SetDataType(p_var.GetDataType());
       }
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index ff49a1d57fd977a6d6b4502b44e48aad34cde872..f5541014af5170488efbb10f6e7e331ef015a848 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -204,7 +204,10 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
                           << "." << (driver_version_ % 100) / 10
                           << ", Runtime Version: " << runtime_version_ / 1000
                           << "." << (runtime_version_ % 100) / 10;
-
+  size_t cudnn_dso_ver = dynload::cudnnGetVersion();  // M*1000 + m*100 + patch
+  LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
+                          << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
+                          << (cudnn_dso_ver % 1000) / 100 << ".";
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
 
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index dc1d751141187edb7738e42c41514614d4d399b0..ea4564058d602a9abe43bd063f1ed73f88a2de08 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer {
   void AddCPURecords(const std::string &anno, uint64_t start_ns,
                      uint64_t end_ns, int64_t device_id, int64_t thread_id) {
     if (anno.empty()) {
-      VLOG(1) << "Empty timeline annotation.";
+      VLOG(10) << "Empty timeline annotation.";
       return;
     }
     std::lock_guard<std::mutex> l(trace_mu_);
@@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer {
                      uint32_t correlation_id, uint64_t bytes) {
     // 0 means timestamp information could not be collected for the kernel.
     if (start_ns == 0 || end_ns == 0) {
-      VLOG(3) << name << " cannot be traced";
+      VLOG(30) << name << " cannot be traced";
       return;
     }
     std::lock_guard<std::mutex> l(trace_mu_);
@@ -228,7 +228,7 @@ class DeviceTracerImpl : public DeviceTracer {
                         int64_t stream_id, uint32_t correlation_id) {
     // 0 means timestamp information could not be collected for the kernel.
     if (start == 0 || end == 0) {
-      VLOG(3) << correlation_id << " cannot be traced";
+      VLOG(30) << correlation_id << " cannot be traced";
       return;
     }
     std::lock_guard<std::mutex> l(trace_mu_);
@@ -347,7 +347,7 @@ class DeviceTracerImpl : public DeviceTracer {
         tracer->AddAnnotation(cbInfo->correlationId, anno);
       }
     } else {
-      VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+      VLOG(10) << "Unhandled API Callback for " << domain << " " << cbid;
     }
   }
   CUpti_SubscriberHandle subscriber_;
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index d3d754b6f58d25a9dfacafaf55d50b353a71ee6d..c26143d2f2780f3042f66b99808c6b85866f9dc4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -65,51 +65,54 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
-  __macro(cudnnSetTensor4dDescriptor);               \
-  __macro(cudnnSetTensor4dDescriptorEx);             \
-  __macro(cudnnSetTensorNdDescriptor);               \
-  __macro(cudnnGetTensorNdDescriptor);               \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
-  __macro(cudnnGetConvolutionForwardAlgorithm);      \
-  __macro(cudnnCreateTensorDescriptor);              \
-  __macro(cudnnDestroyTensorDescriptor);             \
-  __macro(cudnnCreateFilterDescriptor);              \
-  __macro(cudnnSetFilter4dDescriptor);               \
-  __macro(cudnnSetFilterNdDescriptor);               \
-  __macro(cudnnGetFilterNdDescriptor);               \
-  __macro(cudnnSetPooling2dDescriptor);              \
-  __macro(cudnnSetPoolingNdDescriptor);              \
-  __macro(cudnnGetPoolingNdDescriptor);              \
-  __macro(cudnnDestroyFilterDescriptor);             \
-  __macro(cudnnCreateConvolutionDescriptor);         \
-  __macro(cudnnCreatePoolingDescriptor);             \
-  __macro(cudnnDestroyPoolingDescriptor);            \
-  __macro(cudnnSetConvolution2dDescriptor);          \
-  __macro(cudnnDestroyConvolutionDescriptor);        \
-  __macro(cudnnSetConvolutionNdDescriptor);          \
-  __macro(cudnnGetConvolutionNdDescriptor);          \
-  __macro(cudnnDeriveBNTensorDescriptor);            \
-  __macro(cudnnCreateSpatialTransformerDescriptor);  \
-  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
-  __macro(cudnnDestroySpatialTransformerDescriptor); \
-  __macro(cudnnSpatialTfGridGeneratorForward);       \
-  __macro(cudnnSpatialTfGridGeneratorBackward);      \
-  __macro(cudnnSpatialTfSamplerForward);             \
-  __macro(cudnnSpatialTfSamplerBackward);            \
-  __macro(cudnnCreate);                              \
-  __macro(cudnnDestroy);                             \
-  __macro(cudnnSetStream);                           \
-  __macro(cudnnActivationForward);                   \
-  __macro(cudnnConvolutionForward);                  \
-  __macro(cudnnConvolutionBackwardBias);             \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
-  __macro(cudnnTransformTensor);                     \
-  __macro(cudnnPoolingForward);                      \
-  __macro(cudnnPoolingBackward);                     \
-  __macro(cudnnSoftmaxBackward);                     \
-  __macro(cudnnSoftmaxForward);                      \
-  __macro(cudnnGetVersion);                          \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
+  __macro(cudnnSetTensor4dDescriptor);                    \
+  __macro(cudnnSetTensor4dDescriptorEx);                  \
+  __macro(cudnnSetTensorNdDescriptor);                    \
+  __macro(cudnnGetTensorNdDescriptor);                    \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
+  __macro(cudnnGetConvolutionForwardAlgorithm);           \
+  __macro(cudnnCreateTensorDescriptor);                   \
+  __macro(cudnnDestroyTensorDescriptor);                  \
+  __macro(cudnnCreateFilterDescriptor);                   \
+  __macro(cudnnSetFilter4dDescriptor);                    \
+  __macro(cudnnSetFilterNdDescriptor);                    \
+  __macro(cudnnGetFilterNdDescriptor);                    \
+  __macro(cudnnSetPooling2dDescriptor);                   \
+  __macro(cudnnSetPoolingNdDescriptor);                   \
+  __macro(cudnnGetPoolingNdDescriptor);                   \
+  __macro(cudnnDestroyFilterDescriptor);                  \
+  __macro(cudnnCreateConvolutionDescriptor);              \
+  __macro(cudnnCreatePoolingDescriptor);                  \
+  __macro(cudnnDestroyPoolingDescriptor);                 \
+  __macro(cudnnSetConvolution2dDescriptor);               \
+  __macro(cudnnDestroyConvolutionDescriptor);             \
+  __macro(cudnnSetConvolutionNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdDescriptor);               \
+  __macro(cudnnDeriveBNTensorDescriptor);                 \
+  __macro(cudnnCreateSpatialTransformerDescriptor);       \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);        \
+  __macro(cudnnDestroySpatialTransformerDescriptor);      \
+  __macro(cudnnSpatialTfGridGeneratorForward);            \
+  __macro(cudnnSpatialTfGridGeneratorBackward);           \
+  __macro(cudnnSpatialTfSamplerForward);                  \
+  __macro(cudnnSpatialTfSamplerBackward);                 \
+  __macro(cudnnCreate);                                   \
+  __macro(cudnnDestroy);                                  \
+  __macro(cudnnSetStream);                                \
+  __macro(cudnnActivationForward);                        \
+  __macro(cudnnConvolutionForward);                       \
+  __macro(cudnnConvolutionBackwardBias);                  \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
+  __macro(cudnnTransformTensor);                          \
+  __macro(cudnnPoolingForward);                           \
+  __macro(cudnnPoolingBackward);                          \
+  __macro(cudnnSoftmaxBackward);                          \
+  __macro(cudnnSoftmaxForward);                           \
+  __macro(cudnnGetVersion);                               \
+  __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
+  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
   __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index cc5cda6106c188f3156d33480b5d3641eed32556..d53907b749805d9c16737da3105d6c66cacb12fb 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1,
 
 static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                 int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
+  VLOG(30) << "Try to find library: " << dso_path
+           << " from default system path.";
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   // and /usr/local/lib path
   void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index aa20553ceffceded09447693c6e92f55fb48702d..9273e9b1e72f0ad7abd6c20d4a34283fbe24378a 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -76,6 +76,10 @@ extern void* mklml_dso_handle;
   __macro(vdMul);                   \
   __macro(vsExp);                   \
   __macro(vdExp);                   \
+  __macro(vsSqr);                   \
+  __macro(vdSqr);                   \
+  __macro(vsPowx);                  \
+  __macro(vdPowx);                  \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 8fff9844db738dbd6508569a8aaeed044e445e5f..c78f159ad25a17b38333a57a0650d9843c4c5632 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -124,8 +124,8 @@ size_t GpuMaxChunkSize() {
   size_t available = 0;
 
   GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
+  VLOG(100) << "GPU Usage " << available / 1024 / 1024 << "M/"
+            << total / 1024 / 1024 << "M";
   size_t reserving = static_cast<size_t>(0.05 * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 2211e5504373b4a30e5fda0db22a41bdcd9f2421..4cbfe0a69c06cb6793c877263b2feaafa7c3dc60 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -45,7 +45,7 @@ void InitGflags(std::vector<std::string> argv) {
       line += ' ';
     }
     google::ParseCommandLineFlags(&argc, &arr, true);
-    VLOG(1) << "Init commandline: " << line;
+    VLOG(10) << "Init commandline: " << line;
   });
 }
 
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 115abb98d56e633c938695c8127c832eab602110..40af1f95208905231b933e5184a807b061164799 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -112,7 +112,7 @@ struct NCCLContextMap {
         NCCLGroupGuard gurad;
         for (auto &gpu_id : order_) {
           int rank = trainer_id * order_.size() + gpu_id;
-          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+          VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks;
           PADDLE_ENFORCE(cudaSetDevice(gpu_id));
           PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
               comms.get() + gpu_id, nranks, *nccl_id, rank));
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 1f61a0e289f32196ead04d71d07b513cbe4655b1..06d8b65fb1480d9f621ca937c1d66ab7e910f010 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -27,6 +27,7 @@ void BindConstValue(pybind11::module* m) {
   m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
   m->def("kControlDepVarName",
          [] { return framework::ir::Node::kControlDepVarName; });
+  m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; });
 
   auto op_proto_and_checker_maker =
       m->def_submodule("op_proto_and_checker_maker");
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index d3b0d4a22954c1d67dc9551b997dcffa0625cbeb..586e92c2b3146d75a673d1fe326dbee7297a3bfb 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -61,9 +61,9 @@ struct variant_caster<V<Ts...>> {
       if (std::is_same<T, std::vector<float>>::value) {
         auto caster_ints = make_caster<std::vector<int64_t>>();
         if (caster_ints.load(src, convert)) {
-          VLOG(4) << "This value are floats and int64_ts satisfy "
-                     "simultaneously, will set it's type to "
-                     "std::vector<int64_t>";
+          VLOG(40) << "This value are floats and int64_ts satisfy "
+                      "simultaneously, will set it's type to "
+                      "std::vector<int64_t>";
           value = cast_op<std::vector<int64_t>>(caster_ints);
           return true;
         }
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 47de23377398423dabf3b0ed5b670e564f57cdfb..a2eec6e3c48dd126614bbff0227145537b678ac4 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -72,6 +72,7 @@
 #include <iostream>
 #include <sstream>
 #include <string>
+#include <vector>
 
 #include "tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
 
@@ -102,5 +103,22 @@ void Printf(const char* fmt, const Args&... args) {
   Fprintf(std::cout, fmt, args...);
 }
 
+// Renders a byte count with 1024-based units; beyond YB it is printed in B.
+template <typename T>
+std::string HumanReadableSize(T size) {
+  const std::vector<std::string> units(
+      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
+  const double orig = static_cast<double>(size);
+  double f_size = orig;
+  size_t i = 0;
+  // Use >= so an exact multiple rolls over (1024 bytes is 1 kB, not 1024 B).
+  while (f_size >= 1024) {
+    f_size /= 1024;
+    i++;
+  }
+  if (i >= units.size()) return Sprintf("%fB", orig);
+  return Sprintf("%f%s", f_size, units[i]);
+}
+
 }  // namespace string
 }  // namespace paddle
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index a0757b53f37b29de0b3802c345b1ad9db69f16e9..ac1ac8e7c2348289516240b6eddf454d02828e2f 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 
 std::unique_ptr<paddle::framework::ProgramDesc> Load(
     paddle::framework::Executor* executor, const std::string& model_filename) {
-  VLOG(3) << "loading model from " << model_filename;
+  VLOG(30) << "loading model from " << model_filename;
   std::string program_desc_str;
   ReadBinaryFile(model_filename, &program_desc_str);
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d7676f89ab5e781f910f98d03e72d5f7c1023a9a..32f9bca645d80a11274d128b6615a73ffa224705 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -139,6 +139,7 @@ function cmake_gen() {
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
         -DWITH_MKL=${WITH_MKL:-ON}
+        -DWITH_NGRAPH=${WITH_NGRAPH:-OFF}
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-OFF}
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
@@ -155,6 +156,8 @@ function cmake_gen() {
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:-OFF}
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:-ON}
         -DPY_VERSION=${PY_VERSION:-2.7}
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
     ========================================
@@ -171,6 +174,7 @@ EOF
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
         -DWITH_MKL=${WITH_MKL:-ON} \
+        -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
@@ -186,6 +190,8 @@ EOF
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:-OFF} \
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:-ON} \
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
 
@@ -367,7 +373,12 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        ctest --output-on-failure
+        if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
+            ctest -V
+        else
+            ctest --output-on-failure
+        fi
+
         # make install should also be test when unittest
         make install -j `nproc`
         pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
@@ -609,7 +620,24 @@ EOF
         CMD='"true"'
     fi
 
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    if [ "$1" == "cp35-cp35m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \
+        pip3 install opencv-python && pip3 install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    else
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update && ${NCCL_DEPS}
@@ -624,6 +652,8 @@ EOF
     ${DOCKERFILE_GPU_ENV}
     ENV NCCL_LAUNCH_MODE PARALLEL
 EOF
+    fi
+
     if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
         cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
         ADD go/cmd/pserver/pserver /usr/bin/
@@ -698,7 +728,7 @@ function main() {
       build)
         cmake_gen ${PYTHON_ABI:-""}
         build
-        gen_dockerfile
+        gen_dockerfile ${PYTHON_ABI:-""}
         ;;
       build_android)
         build_android
@@ -725,7 +755,7 @@ function main() {
         gen_html
         ;;
       dockerfile)
-        gen_dockerfile
+        gen_dockerfile ${PYTHON_ABI:-""}
         ;;
       capi)
         cmake_gen ${PYTHON_ABI:-""}
@@ -751,6 +781,17 @@ function main() {
         test_fluid_lib
         assert_api_spec_approvals
         ;;
+      assert_api)
+        assert_api_not_changed ${PYTHON_ABI:-""}
+        ;;
+      test_inference)
+        gen_capi_package
+        gen_fluid_lib
+        test_fluid_lib
+        ;;
+      assert_api_approvals)
+        assert_api_spec_approvals
+        ;;
       maccheck)
         cmake_gen ${PYTHON_ABI:-""}
         build_mac
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
index fa8efc20f59addb4526d2cbeaf34f161307c588a..fa1888966d820cd756e47d7c0fce4e1f586a96fc 100644
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -118,7 +118,7 @@ void generateSequenceStartPositions(size_t batchSize,
     }
     buf[i] = pos;
     pos += len;
-    VLOG(1) << " len=" << len;
+    VLOG(10) << " len=" << len;
   }
   buf[numSeqs] = batchSize;
 }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index c4cfd8e4680a3564b099eb4d8e3587e45f907572..dd57a8aac2452d6af11327a4a2e6bc7ad9acd3b1 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -34,6 +34,7 @@ from . import regularizer
 from . import average
 from . import metrics
 from . import transpiler
+from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
@@ -111,10 +112,10 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'reader_queue_speed_test_mode'
+        'eager_delete_scope', 'use_mkldnn', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', 'dist_threadpool_size', 'cpu_deterministic',
+        'eager_delete_tensor_gb', 'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
@@ -126,7 +127,8 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py
new file mode 100644
index 0000000000000000000000000000000000000000..52d9ce75f8d73eb3c3e8683bc0793e9dd8fbe48d
--- /dev/null
+++ b/python/paddle/fluid/distribute_lookup_table.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOOKUP_TABLE_TYPE = "lookup_table"
+
+
+def find_distributed_lookup_table(program):
+    """
+    Return the name of the single distributed lookup table, or None.
+    Ops of the global block are scanned in order; one table is supported.
+    :param program: object exposing global_block().ops
+    :return: table_name or None
+    """
+    found = None
+    for lookup_op in program.global_block().ops:
+        if lookup_op.type != LOOKUP_TABLE_TYPE:
+            continue
+        if lookup_op.attr('is_distributed') is not True:
+            if found is not None:
+                assert lookup_op.input("W")[0] != found
+            continue
+        weight = lookup_op.input("W")[0]
+        if found is None:
+            found = weight
+        elif found != weight:
+            raise RuntimeError("all distributed lookup_table_ops"
+                               " should have only one table")
+    return found
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 4ac94981a7a47530fe6ae4d968212c62dd3e0a93..96b6705e26c0f8d8d223e9020192a8f330c2c727 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -31,6 +31,7 @@ from functools import reduce
 
 __all__ = [
     'prior_box',
+    'density_prior_box',
     'multi_box_head',
     'bipartite_match',
     'target_assign',
@@ -1023,6 +1024,135 @@ def prior_box(input,
     return box, var
 
 
+def density_prior_box(input,
+                      image,
+                      densities=None,
+                      fixed_sizes=None,
+                      fixed_ratios=None,
+                      variance=[0.1, 0.1, 0.2, 0.2],
+                      clip=False,
+                      steps=[0.0, 0.0],
+                      offset=0.5,
+                      name=None):
+    """
+    **Density Prior Box Operator**
+
+    Generate density prior boxes for SSD(Single Shot MultiBox Detector)
+    algorithm. Each position of the input produces N prior boxes, where N is
+    determined by the count of densities, fixed_sizes and fixed_ratios.
+    Boxes centered at grid points around each input position are generated
+    by this operator; the grid points are determined by densities, and the
+    count of density prior boxes is determined by fixed_sizes and fixed_ratios.
+    The number of fixed_sizes must be equal to the number of densities.
+    For densities_i in densities:
+    N_density_prior_box = sum(N_fixed_ratios * densities_i^2),
+
+    Args:
+       input(Variable): The Input Variables, the format is NCHW.
+       image(Variable): The input image data of PriorBoxOp,
+            the layout is NCHW.
+       densities(list|tuple|None): the densities of generated density prior
+            boxes, this attribute should be a list or tuple of integers.
+            Default: None.
+       fixed_sizes(list|tuple|None): the fixed sizes of generated density
+            prior boxes, this attribute should be a list or tuple of same
+            length with :attr:`densities`. Default: None.
+       fixed_ratios(list|tuple|None): the fixed ratios of generated density
+            prior boxes, this attribute should be a list or tuple of
+            numbers (each entry is converted to float).
+            Default: None.
+       variance(list|tuple): the variances to be encoded in density prior boxes.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+       steps(list|tuple): Prior boxes step across width and height, If
+            steps[0] == 0.0/steps[1] == 0.0, the density prior boxes step across
+            width/height of the input will be automatically calculated.
+            Default: [0., 0.]
+       offset(float): Prior boxes center offset. Default: 0.5
+       name(str): Name of the density prior box op. Default: None.
+
+    Returns:
+        tuple: A tuple with two Variable (boxes, variances)
+
+        boxes: the output density prior boxes of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input,
+        num_priors is the total
+        box count of each position of input.
+
+        variances: the expanded variances of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input
+        num_priors is the total
+        box count of each position of input
+
+    Raises:
+        TypeError: densities, fixed_sizes or fixed_ratios is not a list or
+            tuple (this includes the None defaults).
+        ValueError: densities/fixed_sizes lengths differ, or steps is invalid.
+
+    Examples:
+        .. code-block:: python
+
+            box, var = fluid.layers.density_prior_box(
+                input=conv1,
+                image=images,
+                densities=[3, 4],
+                fixed_sizes=[50., 60.],
+                fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
+                clip=True)
+    """
+    helper = LayerHelper("density_prior_box", **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_tuple_(data):
+        return isinstance(data, (list, tuple))
+
+    if not _is_list_or_tuple_(densities):
+        raise TypeError('densities should be a list or a tuple or None.')
+    if not _is_list_or_tuple_(fixed_sizes):
+        raise TypeError('fixed_sizes should be a list or a tuple or None.')
+    if not _is_list_or_tuple_(fixed_ratios):
+        raise TypeError('fixed_ratios should be a list or a tuple or None.')
+    if len(densities) != len(fixed_sizes):
+        raise ValueError('densities and fixed_sizes length should be euqal.')
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        raise ValueError('steps should be a list or tuple '
+                         'with length 2, (step_width, step_height).')
+    # coerce entries: densities to int, the other sequences to float
+    densities = list(map(int, densities))
+    fixed_sizes = list(map(float, fixed_sizes))
+    fixed_ratios = list(map(float, fixed_ratios))
+    steps = list(map(float, steps))
+
+    attrs = {
+        'variances': variance,
+        'clip': clip,
+        'step_w': steps[0],
+        'step_h': steps[1],
+        'offset': offset,
+    }
+    if densities:
+        attrs['densities'] = densities
+    if fixed_sizes:
+        attrs['fixed_sizes'] = fixed_sizes
+    if fixed_ratios:
+        attrs['fixed_ratios'] = fixed_ratios
+
+    box = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="density_prior_box",
+        inputs={"Input": input,
+                "Image": image},
+        outputs={"Boxes": box,
+                 "Variances": var},
+        attrs=attrs, )
+    box.stop_gradient = True
+    var.stop_gradient = True
+    return box, var
+
+
 def multi_box_head(inputs,
                    image,
                    base_size,
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index d1c926c4e4d41d55130a37e0bf2492f56fde0658..1ab48c00548b58f4b3e411d8e46e8cf496d6b891 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -30,7 +30,8 @@ from ..unique_name import generate as unique_name
 
 __all__ = [
     'data', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
-    'random_data_generator', 'py_reader', 'Preprocessor', 'load'
+    'random_data_generator', 'py_reader', 'create_py_reader_by_data',
+    'Preprocessor', 'load'
 ]
 
 
@@ -475,6 +476,159 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
     return monkey_patch_reader_methods(main_prog_var)
 
 
+def _py_reader(capacity,
+               shapes,
+               dtypes,
+               lod_levels=None,
+               name=None,
+               use_double_buffer=True,
+               feed_list=None):
+    """Build the reader shared by py_reader and create_py_reader_by_data."""
+    if feed_list is not None:
+        if not isinstance(feed_list, list):
+            raise TypeError("feed_list should be a list of Variable"
+                            " instead of " + str(type(feed_list)))
+        lod_levels = []
+        dtypes = []
+        shape_concat = []
+        ranks = []
+        shapes = []
+        # take dtype, shape and lod_level from each entry of feed_list
+        for feed_data in feed_list:
+            dtypes.append(feed_data.dtype)
+            shape_concat.extend(feed_data.shape)
+            ranks.append(len(feed_data.shape))
+            shapes.append(feed_data.shape)
+            lod_levels.append(feed_data.lod_level)
+    else:
+        dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+        shape_concat = []
+        ranks = []
+
+        for shape in shapes:
+            shape_concat.extend(shape)
+            ranks.append(len(shape))
+
+        if lod_levels is None:
+            lod_levels = [0] * len(shapes)
+    # pick variable names for the queue, reader and double buffer
+    if name is None:
+        queue_name = unique_name('lod_tensor_blocking_queue')
+        reader_name = unique_name('create_py_reader')
+        double_buffer_name = unique_name('double_buffer')
+    else:
+        queue_name = "_".join([name, "queue"])
+        reader_name = "_".join([name, "reader"])
+        double_buffer_name = "_".join([name, "double_buffer"])
+
+    var = global_scope().var(queue_name)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=reader_name)
+    startup_blk.append_op(
+        type='create_py_reader',
+        inputs={'blocking_queue': [queue_name]},
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    reader = monkey_patch_reader_methods(main_prog_var)
+    if use_double_buffer:
+        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
+        # The double buffer reader is what gets returned, but its reset
+        # method is taken from the underlying py_reader.
+        double_buffer_reader.reset = reader.reset
+        reader = double_buffer_reader
+
+    # attach the py_reader-specific attributes used by the closures below
+    reader.queue = feed_queue
+    current_reset_method = reader.reset
+    reader.thread = None
+    reader.tensor_provider = None
+    reader.exited = False
+
+    def start_provide_thread(func):
+        def __provider_thread__():
+            for tensors in func():
+                array = core.LoDTensorArray()
+                for item in tensors:
+                    if not isinstance(item, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(item, core.CPUPlace())
+                        item = tmp
+
+                    array.append(item)
+                # reader.exited is set by __reset__ to stop this thread
+                if reader.exited:
+                    break
+                feed_queue.push(array)
+                if reader.exited:
+                    break
+            feed_queue.close()
+
+        reader.thread = threading.Thread(target=__provider_thread__)
+        reader.thread.daemon = True
+        reader.thread.start()
+
+    def __set_tensor_provider__(func):
+        reader.tensor_provider = func
+
+    def __set_paddle_reader__(paddle_reader):
+        with program_guard(Program(), Program()):
+            actual_feed_list = feed_list
+            if actual_feed_list is None:
+                actual_feed_list = []
+                counter = 0
+                for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
+                    name = str(counter)
+                    actual_feed_list.append(
+                        data(
+                            name=name,
+                            dtype=dtype,
+                            shape=shape,
+                            lod_level=lod_level))
+                    counter += 1
+
+            data_names = [feed_data.name for feed_data in actual_feed_list]
+            feeder = DataFeeder(
+                feed_list=actual_feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(
+                paddle_reader, multi_devices=False)
+        # yield each batch as a list ordered like data_names
+        def __tensor_provider__():
+            for slots in paddle_reader():
+                yield [slots[data_name] for data_name in data_names]
+
+        __set_tensor_provider__(__tensor_provider__)
+
+    def __reset__():
+        current_reset_method()
+        if reader.thread is not None and reader.tensor_provider is not None:
+            reader.exited = True
+            reader.thread.join()
+            reader.exited = False
+
+    def __start__():
+        start_provide_thread(reader.tensor_provider)
+
+    reader.reset = __reset__
+    reader.decorate_tensor_provider = __set_tensor_provider__
+    reader.decorate_paddle_reader = __set_paddle_reader__
+    reader.start = __start__
+
+    return reader
+
+
 def py_reader(capacity,
               shapes,
               dtypes,
@@ -599,128 +753,72 @@ def py_reader(capacity,
         >>>     except fluid.core.EOFException:
         >>>         test_reader.reset()
     """
-    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
-    shape_concat = []
-    ranks = []
-
-    for shape in shapes:
-        shape_concat.extend(shape)
-        ranks.append(len(shape))
-
-    if lod_levels is None:
-        lod_levels = [0] * len(shapes)
-
-    if name is None:
-        queue_name = unique_name('lod_tensor_blocking_queue')
-        reader_name = unique_name('create_py_reader')
-        double_buffer_name = unique_name('double_buffer')
-    else:
-        queue_name = "_".join([name, "queue"])
-        reader_name = "_".join([name, "reader"])
-        double_buffer_name = "_".join([name, "double_buffer"])
-
-    var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
-
-    startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=reader_name)
-    startup_blk.append_op(
-        type='create_py_reader',
-        inputs={'blocking_queue': [queue_name]},
-        outputs={'Out': [startup_var]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'ranks': ranks
-        })
-
-    startup_var.desc.set_dtypes(dtypes)
-    startup_var.persistable = True
-
-    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
-                                      startup_var)
-
-    reader = monkey_patch_reader_methods(main_prog_var)
-    if use_double_buffer:
-        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
-        # we return a double buffer reader. However, the reset method comes from
-        # py_reader.
-        double_buffer_reader.reset = reader.reset
-        reader = double_buffer_reader
-
-    # monkey patch py_reader special methods
-    reader.queue = feed_queue
-    current_reset_method = reader.reset
-    reader.thread = None
-    reader.tensor_provider = None
-    reader.exited = False
-
-    def start_provide_thread(func):
-        def __provider_thread__():
-            for tensors in func():
-                array = core.LoDTensorArray()
-                for item in tensors:
-                    if not isinstance(item, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(item, core.CPUPlace())
-                        item = tmp
-
-                    array.append(item)
-
-                if reader.exited:
-                    break
-                feed_queue.push(array)
-                if reader.exited:
-                    break
-            feed_queue.close()
+    return _py_reader(
+        capacity=capacity,
+        shapes=shapes,
+        dtypes=dtypes,
+        lod_levels=lod_levels,
+        name=name,
+        use_double_buffer=use_double_buffer)
 
-        reader.thread = threading.Thread(target=__provider_thread__)
-        reader.thread.daemon = True
-        reader.thread.start()
 
-    def __set_tensor_provider__(func):
-        reader.tensor_provider = func
+def create_py_reader_by_data(capacity,
+                             feed_list,
+                             name=None,
+                             use_double_buffer=True):
+    """
+    Create a Python reader for data feeding in Python.
 
-    def __set_paddle_reader__(paddle_reader):
-        with program_guard(Program(), Program()):
-            feed_list = []
-            counter = 0
-            for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
-                name = str(counter)
-                feed_list.append(
-                    data(
-                        name=name,
-                        dtype=dtype,
-                        shape=shape,
-                        lod_level=lod_level))
-                counter += 1
-
-            feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace())
-            paddle_reader = feeder.decorate_reader(
-                paddle_reader, multi_devices=False)
+    This layer returns a Reader Variable.
 
-        def __tensor_provider__():
-            for slots in paddle_reader():
-                yield [slots[str(idx)] for idx in six.moves.xrange(counter)]
+    Works much like py_reader except that its input is feed_list
+    instead of shapes, dtypes and lod_levels.
 
-        __set_tensor_provider__(__tensor_provider__)
+    Args:
+       capacity(int): The buffer capacity maintained by :code:`py_reader`.
+       feed_list(list(Variable)): The data feed list.
+       name(basestring): The prefix of the Python queue name and Reader name.
+            If None, the names will be generated automatically.
+       use_double_buffer(bool): Whether use double buffer or not.
 
-    def __reset__():
-        current_reset_method()
-        if reader.thread is not None and reader.tensor_provider is not None:
-            reader.exited = True
-            reader.thread.join()
-            reader.exited = False
+    Returns:
+       Variable: A Reader from which we can get feeding data.
 
-    def __start__():
-        start_provide_thread(reader.tensor_provider)
+    Examples:
 
-    reader.reset = __reset__
-    reader.decorate_tensor_provider = __set_tensor_provider__
-    reader.decorate_paddle_reader = __set_paddle_reader__
-    reader.start = __start__
+        1. The basic usage of :code:`create_py_reader_by_data` is as follows:
 
-    return reader
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> image = fluid.layers.data(name='image', shape=[3,224,224], dtype='float32')
+        >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        >>> reader = fluid.layers.create_py_reader_by_data(capacity=64, feed_list=[image, label])
+        >>> reader.decorate_paddle_reader(
+        >>>     paddle.reader.shuffle(paddle.batch(mnist.train())))
+        >>>
+        >>> img, label = fluid.layers.read_file(reader)
+        >>> loss = network(img, label) # some network definition
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
+        >>>
+        >>> exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+        >>> for epoch_id in range(10):
+        >>>     reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             exe.run(fetch_list=[loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         reader.reset()
+    """
+    return _py_reader(
+        capacity=capacity,
+        shapes=None,
+        dtypes=None,
+        lod_levels=None,
+        name=name,
+        use_double_buffer=use_double_buffer,
+        feed_list=feed_list)
 
 
 def open_files(filenames,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a87f123117491f27c7f024a758200e3a8e41fbc2..bf1d7171e2f453872d809e42994f946328dfe470 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -27,6 +27,7 @@ from .tensor import concat
 from . import utils
 from .. import unique_name
 from functools import reduce
+from .. import core
 
 __all__ = [
     'fc',
@@ -101,6 +102,7 @@ __all__ = [
     'image_resize',
     'image_resize_short',
     'resize_bilinear',
+    'resize_nearest',
     'gather',
     'scatter',
     'sequence_scatter',
@@ -154,13 +156,16 @@ __all__ = [
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'space_to_depth',
     'affine_grid',
     'sequence_reverse',
     'affine_channel',
+    'similarity_focus',
     'hash',
     'grid_sampler',
     'log_loss',
     'add_position_encoding',
+    'bilinear_tensor_product',
 ]
 
 
@@ -1664,6 +1669,20 @@ def conv2d(input,
 
     pre_bias = helper.create_variable_for_type_inference(dtype)
 
+    if use_cudnn:
+        helper.create_variable(
+            name="kCUDNNFwdAlgoCache",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+        helper.create_variable(
+            name="kCUDNNBwdDataAlgoCache",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+        helper.create_variable(
+            name="kCUDNNBwdFilterAlgoCache",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
     helper.append_op(
         type=l_type,
         inputs={
@@ -1677,7 +1696,7 @@ def conv2d(input,
             'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
-            'use_mkldnn': False
+            'use_mkldnn': False,
         })
 
     pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -3060,7 +3079,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
             x = fluid.layers.data(name='y', shape=[10, 5],
                              dtype='float32', lod_level=1)
             pad_value = fluid.layers.assign(
-                input=numpy.array([0], dtype=numpy.float32))
+                input=numpy.array([0.0], dtype=numpy.float32))
             out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
     """
 
@@ -4048,8 +4067,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[8], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[7], dtype='float32')
+            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             cost = fluid.layers.edit_distance(input=x,label=y)
     """
     helper = LayerHelper("edit_distance", **locals())
@@ -4723,7 +4742,8 @@ def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
                                ignore_index=-100,
-                               numeric_stable_mode=False):
+                               numeric_stable_mode=False,
+                               return_softmax=False):
     """
     **Softmax With Cross Entropy Operator.**
 
@@ -4787,9 +4807,15 @@ def softmax_with_cross_entropy(logits,
                                     the algorithm is always numerically stable. 
                                     Note that the speed may be slower when use 
                                     stable algorithm. Default: False
+        return_softmax (bool): A flag indicating whether to return the softmax 
+                               along with the cross entropy loss. Default: False
 
     Returns:
-        Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
+        Variable or Tuple of two Variables: Return the cross entropy loss if 
+                              `return_softmax` is False, otherwise the tuple 
+                              (loss, softmax), where the cross entropy loss is 
+                              a 2-D tensor with shape [N x 1], and softmax is a 
+                              2-D tensor with shape [N x K].
 
     Examples:
         .. code-block:: python
@@ -4814,6 +4840,10 @@ def softmax_with_cross_entropy(logits,
             'ignore_index': ignore_index,
             'numeric_stable_mode': numeric_stable_mode
         })
+
+    if return_softmax:
+        return loss, softmax
+
     return loss
 
 
@@ -5639,7 +5669,8 @@ def image_resize(input,
                  out_shape=None,
                  scale=None,
                  name=None,
-                 resample='BILINEAR'):
+                 resample='BILINEAR',
+                 actual_shape=None):
     """
     **Resize a Batch of Images**
 
@@ -5649,6 +5680,7 @@ def image_resize(input,
     Supporting resample methods:
 
         'BILINEAR' : Bilinear interpolation
+        'NEAREST' : Nearest neighbor interpolation
 
     Args:
         input (Variable): The input tensor of image resize layer,
@@ -5663,25 +5695,51 @@ def image_resize(input,
                          Default: None
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
-        resample(str): The resample method. It can only be 'BILINEAR' currently.
+        resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST' 
+                       currently.
                        Default: 'BILINEAR'
+        actual_shape(Variable): An optional input to specify output shape 
+                                dynamically. If provided, image resize  
+                                according to this given shape rather than 
+                                :attr:`out_shape` and :attr:`scale` specifying
+                                shape. That is to say actual_shape has the 
+                                highest priority. It is recommended to use 
+                                actual_shape instead of :attr:`out_shape` if you 
+                                want to specify output shape dynamically. When 
+                                using actual_shape to specify output shape, one of 
+                                :attr:`out_shape` and :attr:`scale` should also be 
+                                set, otherwise errors would occur in graph
+                                constructing stage.
+                                Default: None
 
     Returns:
         Variable: The output is a 4-D tensor of the shape
         (num_batches, channls, out_h, out_w).
 
+    Raises:
+        TypeError: out_shape should be a list or tuple or Variable.
+        TypeError: actual_shape should either be Variable or None.
+        ValueError: The 'resample' of image_resize can only be 'BILINEAR' 
+                    or 'NEAREST' currently.
+        ValueError: One of out_shape and scale must not be None.
+        ValueError: out_shape length should be 2.
+
     Examples:
         .. code-block:: python
 
             out = fluid.layers.image_resize(input, out_shape=[12, 12])
     """
-    resample_methods = {'BILINEAR': 'bilinear_interp'}
+    resample_methods = {
+        'BILINEAR': 'bilinear',
+        'NEAREST': 'nearest',
+    }
     if resample not in resample_methods:
         raise ValueError(
-            "The 'resample' of image_resize can only be 'BILINEAR' currently.")
+            "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
+        )
     if out_shape is None and scale is None:
-        raise ValueError("One of out_shape and scale must not be None")
-    helper = LayerHelper('bilinear_interp', **locals())
+        raise ValueError("One of out_shape and scale must not be None.")
+    helper = LayerHelper('interpolate', **locals())
     dtype = helper.input_dtype()
 
     def _is_list_or_turple_(data):
@@ -5691,33 +5749,106 @@ def image_resize(input,
     out_w = 0
     inputs = {"X": input}
     if out_shape is not None:
-        if not (_is_list_or_turple_(out_shape) and
-                len(out_shape) == 2) and not isinstance(out_shape, Variable):
-            raise ValueError('out_shape should be a list or tuple or variable')
-        if _is_list_or_turple_(out_shape):
-            out_shape = list(map(int, out_shape))
-            out_h = out_shape[0]
-            out_w = out_shape[1]
-        else:
+        if isinstance(out_shape, Variable):
+            warnings.warn("out_shape as Variable type is deprecated, \
+                    it is recommended to use actual_shape instead of \
+                    out_shape to specify output shape dynamically.")
             inputs['OutSize'] = out_shape
+        elif not (_is_list_or_turple_(out_shape)):
+            raise TypeError("out_shape should be a list or tuple or Variable.")
+        elif len(out_shape) != 2:
+            raise ValueError("out_shape length should be 2.")
+
+        out_shape = list(map(int, out_shape))
+        out_h = out_shape[0]
+        out_w = out_shape[1]
     else:
         out_h = int(input.shape[2] * scale)
         out_w = int(input.shape[3] * scale)
 
+    if isinstance(actual_shape, Variable):
+        inputs["OutSize"] = actual_shape
+    elif actual_shape is not None:
+        raise TypeError("actual_shape should either be Variable or None.")
+
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
-        type=resample_methods[resample],
+        type='interpolate',
         inputs=inputs,
         outputs={"Out": out},
-        attrs={"out_h": out_h,
-               "out_w": out_w})
+        attrs={
+            "out_h": out_h,
+            "out_w": out_w,
+            "interp_method": resample_methods[resample]
+        })
     return out
 
 
-@templatedoc(op_type="bilinear_interp")
-def resize_bilinear(input, out_shape=None, scale=None, name=None):
+@templatedoc(op_type="interpolate")
+def resize_bilinear(input,
+                    out_shape=None,
+                    scale=None,
+                    name=None,
+                    actual_shape=None):
     """
-    ${comment}
+    Resize input by performing bilinear interpolation based on given 
+    output shape, which is specified by actual_shape, out_shape and scale
+    in priority order.
+
+    Bilinear interpolation is an extension of linear interpolation for 
+    interpolating functions of two variables (e.g. H-direction and 
+    W-direction in this op) on a rectilinear 2D grid. The key idea is 
+    to perform linear interpolation first in one direction, and then 
+    again in the other direction.
+
+    For details of bilinear interpolation, please refer to Wikipedia: 
+    https://en.wikipedia.org/wiki/Bilinear_interpolation
+
+    Args:
+        input(${x_type}): ${x_comment}.
+
+        out_shape(${out_size_type}): ${out_size_comment}.
+
+        scale(float|None): The multiplier for the input height or width. At
+             least one of out_shape or scale must be set. And out_shape has
+             a higher priority than scale. Default: None.
+
+        name(str|None): The output variable name.
+        actual_shape(Variable): An optional input to specify output shape 
+                                dynamically. If provided, image resize  
+                                according to this given shape rather than 
+                                :attr:`out_shape` and :attr:`scale` specifying
+                                shape. That is to say actual_shape has the 
+                                highest priority. It is recommended to use 
+                                actual_shape instead of :attr:`out_shape` if you 
+                                want to specify output shape dynamically. When 
+                                using actual_shape to specify output shape, one of 
+                                :attr:`out_shape` and :attr:`scale` should also be 
+                                set, otherwise errors would occur in graph
+                                constructing stage.
+                                Default: None
+
+    Returns:
+        ${out_comment}.
+    """
+
+    return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
+
+
+@templatedoc(op_type="interpolate")
+def resize_nearest(input,
+                   out_shape=None,
+                   scale=None,
+                   name=None,
+                   actual_shape=None):
+    """
+    Resize input by performing nearest neighbor interpolation in both the
+    3rd dimension (in height direction) and the 4th dimension (in width
+    direction) based on given output shape, which is specified by actual_shape,
+    out_shape and scale in priority order.
+
+    For details of nearest neighbor interpolation, please refer to Wikipedia: 
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
 
     Args:
         input(${x_type}): ${x_comment}.
@@ -5729,12 +5860,25 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
              a higher priority than scale. Default: None.
 
         name(str|None): The output variable name.
+        actual_shape(Variable): An optional input to specify output shape 
+                                dynamically. If provided, image resize  
+                                according to this given shape rather than 
+                                :attr:`out_shape` and :attr:`scale` specifying
+                                shape. That is to say actual_shape has the 
+                                highest priority. It is recommended to use 
+                                actual_shape instead of :attr:`out_shape` if you 
+                                want to specify output shape dynamically. When 
+                                using actual_shape to specify output shape, one of 
+                                :attr:`out_shape` and :attr:`scale` should also be 
+                                set, otherwise errors would occur in graph
+                                constructing stage.
+                                Default: None
 
     Returns:
         ${out_comment}.
     """
 
-    return image_resize(input, out_shape, scale, name, 'BILINEAR')
+    return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
 
 
 def image_resize_short(input, out_short_len, resample='BILINEAR'):
@@ -6678,7 +6822,7 @@ def prelu(x, mode, param_attr=None, name=None):
         alpha_shape = x.shape
     dtype = helper.input_dtype(input_param_name='x')
     alpha = helper.create_parameter(
-        attr=param_attr,
+        attr=helper.param_attr,
         shape=alpha_shape,
         dtype='float32',
         is_bias=False,
@@ -7674,6 +7818,66 @@ def maxout(x, groups, name=None):
     return out
 
 
+def space_to_depth(x, blocksize, name=None):
+    """
+    Applies space_to_depth with the given blocksize to the input LoDtensor with layout [batch, channel, height, width].
+    
+    This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the 
+    input LoDtensor where values from the height and width dimensions are moved to the channel dimension. 
+    The attr blocksize indicates the input block size.
+    
+    space_to_depth will reorganize the elements of input with shape [batch, channel, height, width] according
+    to blocksize to construct output with shape [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]:
+    
+    This operation is useful for resizing the activations between convolutions
+    (but keeping all data).
+
+    - Non-overlapping blocks of size block_size x block_size are rearranged into depth at each location.
+    - The depth of the output tensor is block_size * block_size * input channel 
+    - The Y, X coordinates within each block of the input become the high order component of the output channel index
+    - channel should be divisible by square of blocksize
+    - height, width should be divisible by blocksize
+
+
+    Args:
+        x(variable): The input LoDtensor.
+        blocksize(int): The blocksize used to select elements on each feature map. It must be a Python int and should be >= 2.
+
+    Returns:
+        Variable: The output LoDtensor.
+
+    Raises:
+        ValueError: blocksize must be a Python int.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+                name='data', shape=[1, 4, 2, 2], dtype='float32')
+            space_to_depthed = fluid.layers.space_to_depth(
+                x=data, blocksize=2)
+    """
+
+    helper = LayerHelper("space_to_depth", **locals())
+
+    if not (isinstance(blocksize, int)):
+        raise ValueError("blocksize must be a python Int")
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(
+            dtype=x.dtype)  #fix create
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="space_to_depth",
+        inputs={"X": x},
+        attrs={"blocksize": blocksize},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def sequence_reverse(x, name=None):
     """ 
@@ -7742,6 +7946,118 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
     return out
 
 
+def similarity_focus(input, axis, indexes, name=None):
+    """  
+    SimilarityFocus Operator
+
+    Generate a similarity focus mask with the same shape of input using the following method:
+    1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
+       to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
+       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
+       is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
+    2. For each index, find the largest numbers in the tensor T, so that the same 
+       row and same column has at most one number(what it means is that if the 
+       largest number has been found in the i-th row and the j-th column, then 
+       the numbers in the i-th row or j-th column will be skipped. And then the 
+       next largest number will be selected from the remaining numbers. Obviously 
+       there will be min(B, C) numbers), and mark the corresponding position of the 
+       3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
+       each index.
+    3. Broadcast the 3-D similarity focus mask to the same shape of input X.
+
+    Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
+
+    .. code-block:: text
+
+        * Example :
+
+            Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is
+            the number of channels and the shape of feature map is (A, B):
+                x.shape = (2, 3, 2, 2)
+                x.data = [[[[0.8, 0.1],
+                            [0.4, 0.5]],
+
+                           [[0.9, 0.7],
+                            [0.9, 0.9]],
+
+                           [[0.8, 0.9],
+                            [0.1, 0.2]]],
+
+
+                          [[[0.2, 0.5],
+                            [0.3, 0.4]],
+
+                           [[0.9, 0.7],
+                            [0.8, 0.4]],
+
+                           [[0.0, 0.2],
+                            [0.4, 0.7]]]]
+
+            Given axis: 1 (the axis of the channel)
+            Given indexes: [0]
+
+            then we get a 4-D tensor out with the same shape of input x:
+                out.shape = (2, 3, 2, 2)
+                out.data = [[[[1.0, 0.0],
+                              [0.0, 1.0]],
+
+                             [[1.0, 0.0],
+                              [0.0, 1.0]],
+
+                             [[1.0, 0.0],
+                              [0.0, 1.0]]],
+
+                            [[[0.0, 1.0],
+                              [1.0, 0.0]],
+
+                             [[0.0, 1.0],
+                              [1.0, 0.0]],
+
+                             [[0.0, 1.0],
+                              [1.0, 0.0]]]]
+
+    Args:
+        input(Variable): The input tensor variable(default float). It should 
+            be a 4-D tensor with shape [BatchSize, A, B, C].
+        axis(int): Indicating the dimension to be selected. It can only be
+            1, 2 or 3.
+        indexes(list): Indicating the indexes of the selected dimension.
+
+    Returns:
+        Variable: A tensor variable with the same shape and same type 
+            as the input.
+        
+    Examples:
+        .. code-block:: python
+            data = fluid.layers.data(
+              name='data', shape=[2, 3, 2, 2], dtype='float32')
+            x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
+    """
+    helper = LayerHelper('similarity_focus', **locals())
+    # check attrs
+    if isinstance(axis, int) is False:
+        raise TypeError("axis must be int type.")
+    if isinstance(indexes, list) is False:
+        raise TypeError("indexes must be list type.")
+    if axis != 1 and axis != 2 and axis != 3:
+        raise ValueError("axis must be 1, 2 or 3.")
+    if len(indexes) == 0:
+        raise ValueError("indexes can not be empty.")
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
+    helper.append_op(
+        type='similarity_focus',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={"axis": axis,
+               "indexes": indexes})
+    return out
+
+
 def hash(input, hash_size, num_hash=1, name=None):
     """
     Hash the input to an integer whose value is less than the given hash size.
@@ -7985,3 +8301,72 @@ def add_position_encoding(input, alpha, beta, name=None):
         attrs={"alpha": alpha,
                "beta": beta})
     return out
+
+
+def bilinear_tensor_product(x,
+                            y,
+                            size,
+                            act=None,
+                            name=None,
+                            param_attr=None,
+                            bias_attr=None):
+    """
+    **Add Bilinear Tensor Product Layer**
+
+    This layer performs bilinear tensor product on two inputs.
+    For example:
+
+    .. math::
+       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
+
+    In this formula:
+      - :math:`x`: the first input contains M elements, shape is [batch_size, M].
+      - :math:`y`: the second input contains N elements, shape is [batch_size, N].
+      - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
+      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
+      - :math:`y^\mathrm{T}`: the transpose of :math:`y`.
+
+    Args:
+        x (Variable): 2-D input tensor with shape [batch_size, M]
+        y (Variable): 2-D input tensor with shape [batch_size, N]
+        size (int): The dimension of this layer.
+        act (str, default None): Activation to be applied to the output of this layer.
+        name (str, default None): The name of this layer.
+        param_attr (ParamAttr, default None): The parameter attribute for the learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+
+    Returns:
+        Variable: A 2-D Tensor of shape [batch_size, size].
+
+    Examples:
+        .. code-block:: python
+
+          tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000)
+    """
+    helper = LayerHelper('bilinear_tensor_product', **locals())
+    dtype = helper.input_dtype('x')
+
+    param_shape = [size, x.shape[1], y.shape[1]]
+
+    w = helper.create_parameter(
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+
+    inputs = {"X": x, "Y": y, "Weight": w}
+    if helper.bias_attr:
+        bias_size = [1, size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        inputs["Bias"] = bias
+    helper.append_op(
+        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
+
+    # add activation
+    return helper.append_activation(out)
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 09a7cb8dc9339afa666f8cf09e92a27ffba8a9b3..ff32c00104171bf42c00be33f05758a4387228e1 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat',
-    'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant',
-    'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf',
-    'has_nan', 'isfinite'
+    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
+    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
+    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
 ]
 
 
@@ -193,6 +193,60 @@ def concat(input, axis=0, name=None):
     return out
 
 
+def tensor_array_to_tensor(input, axis=1, name=None):
+    """
+    This function concatenates the input LodTensorArray along the axis mentioned
+    and returns that as the output.
+
+    A simple example as below:
+    
+    .. code-block:: text
+    
+        Given:
+
+        input.data = {[[0.6, 0.1, 0.3],
+                       [0.5, 0.3, 0.2]],
+                      [[1.3],
+                       [1.8]],
+                      [[2.3, 2.1],
+                       [2.5, 2.4]]}
+        
+        axis = 1
+    
+        Then:
+
+        output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
+                       [0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
+
+        output_index.data = [3, 1, 2]
+
+    Args:
+        input(list): Input LodTensorArray
+        axis(int): Integer axis along which the tensors will be concatenated
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: Output variable of the concatenation
+        Variable: The input LodTensorArray items' dims along the axis
+
+    Examples:
+        .. code-block:: python
+
+           output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
+    """
+    helper = LayerHelper('tensor_array_to_tensor', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    out_index = helper.create_variable_for_type_inference(dtype="int32")
+    helper.append_op(
+        type='tensor_array_to_tensor',
+        inputs={'X': input},
+        outputs={'Out': [out],
+                 'OutIndex': [out_index]},
+        attrs={'axis': axis})
+    return out, out_index
+
+
 def sums(input, out=None):
     """
     This function performs the sum operation on the input and returns the
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 4e1d1450dea85fe4eb3e68713250836e4beac992..b8bb3db1eedcf25c9b6a02ad3b4f261e8be8efce 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -108,6 +108,8 @@ class OpDescCreationMethod(object):
                     new_attr.i = user_defined_attr
                 elif attr.type == framework_pb2.FLOAT:
                     new_attr.f = user_defined_attr
+                elif attr.type == framework_pb2.LONG:
+                    new_attr.l = user_defined_attr
                 elif attr.type == framework_pb2.STRING:
                     new_attr.s = user_defined_attr
                 elif attr.type == framework_pb2.BOOLEAN:
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7e2364a5a872cdd8cf590438cc081ab070db767d..da92826d410505c9a80820f655162dd22e6b5966 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,21 +13,23 @@
 # limitations under the License.
 
 from __future__ import print_function
-import re
-import sys
+
 from collections import defaultdict
+from contextlib import contextmanager
+
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+
 from . import framework
 from . import layers
+from . import unique_name
 from .backward import append_backward
+from .clip import append_gradient_clip_ops, error_clip_callback
 from .framework import program_guard
-from . import unique_name
 from .initializer import Constant
 from .layer_helper import LayerHelper
-from .regularizer import append_regularization_ops
-from .clip import append_gradient_clip_ops, error_clip_callback
-from contextlib import contextmanager
 from .layers import ops
+from .regularizer import append_regularization_ops
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -85,7 +87,7 @@ class Optimizer(object):
             name=unique_name.generate("learning_rate"),
             shape=[1],
             value=float(self._learning_rate),
-            dtype='float32' if self._dtype == None else self._dtype,
+            dtype='float32' if self._dtype is None else self._dtype,
             persistable=True)
 
     def _global_learning_rate(self, program=None):
@@ -245,6 +247,50 @@ class Optimizer(object):
             end = len(global_block.ops)
             return global_block._slice_ops(start, end)
 
+    def _process_distribute_lookuptable(self, param_grads, loss,
+                                        startup_program):
+        """
+        The distributed lookup table currently supports only the SGD optimizer, and
+        no other optimizer or regularization. So we find the table parameter,
+        avoid adding regularization and other ops for it, and add an SGD optimize op
+        for it independently.
+        :param param_grads(list((Var, Var))): list of (param, grad) pair.
+        :param loss: the loss variable.
+        :param startup_program: the startup program
+        """
+        program = loss.block.program
+        table_name = find_distributed_lookup_table(program)
+        table_param = None
+        table_grad = None
+        new_param_grads = []
+        for p, g in param_grads:
+            if p.name == table_name:
+                if table_param is not None:
+                    raise RuntimeError(
+                        "multi dist table var found, only support one now!")
+                table_param = p
+                table_grad = g
+            else:
+                new_param_grads.append((p, g))
+        sgd_op = None
+        if table_param is not None:
+            with program_guard(program, startup_program):
+                param_and_grad = [table_param, table_grad]
+                with table_param.block.program._optimized_guard(param_and_grad), \
+                     framework.name_scope("optimizer"):
+                    self._create_global_learning_rate()
+                    # create the optimize op
+                    sgd_op = loss.block.append_op(
+                        type='sgd',
+                        inputs={
+                            "Param": table_param,
+                            "Grad": table_grad,
+                            "LearningRate":
+                            self._create_param_lr(param_and_grad)
+                        },
+                        outputs={"ParamOut": param_and_grad[0]})
+        return new_param_grads, (table_param, table_grad), sgd_op
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -260,6 +306,9 @@ class Optimizer(object):
 
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
+        params_grads, table_param_and_grad, table_optimize_op = \
+            self._process_distribute_lookuptable(params_grads, loss, startup_program)
+
         params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
@@ -268,6 +317,9 @@ class Optimizer(object):
 
         optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                       startup_program)
+        if table_optimize_op is not None:
+            optimize_ops.append(table_optimize_op)
+            params_grads.append(table_param_and_grad)
         return optimize_ops, params_grads
 
 
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 57185da4d1d38f3848994aae105411cf2844843a..d8aace9fdfa601413bb4d4b1b2a309ba6a8e4ece 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -61,14 +61,25 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
                 params_and_grads.append((param, grad))
                 continue
 
-            assert grad.shape == regularization_term.shape
+            new_grad = grad
+            if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+                # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
+                # the grad's type and name will be changed. But the gradient's name
+                # is used in ParallelExecutor Reduce mode, so I add a flag for
+                # the new_grad here.
+                new_grad = grad.block.create_var(
+                    name=grad.name + core.kNewGradSuffix(),
+                    dtype=param.dtype,
+                    shape=param.shape,
+                    lod_level=param.lod_level,
+                    type=core.VarDesc.VarType.LOD_TENSOR)
 
             grad.block.append_op(
-                type='elementwise_add',
-                inputs={"X": grad,
-                        "Y": regularization_term},
-                outputs={"Out": grad})
-            params_and_grads.append((param, grad))
+                type='sum',
+                inputs={"X": [grad, regularization_term]},
+                outputs={"Out": new_grad})
+
+            params_and_grads.append((param, new_grad))
 
     return params_and_grads
 
@@ -142,26 +153,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
         assert isinstance(block, framework.Block)
 
         decay = block.create_var(
-            dtype="float32", shape=param.shape, lod_level=param.lod_level)
-
-        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-            idx = block.create_var(
-                dtype="int64",
-                shape=param.shape,
-                type=core.VarDesc.VarType.LOD_TENSOR)
-            decay = block.create_var(
-                dtype="float32",
-                shape=param.shape,
-                type=core.VarDesc.VarType.LOD_TENSOR)
-            block.append_op(
-                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
-            block.append_op(
-                type='lookup_table',
-                inputs={'W': param,
-                        'Ids': idx},
-                outputs={'Out': decay},
-                attrs={'is_sparse': True})
-            param = decay
+            dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
 
         # Append Op to calculate decay
         block.append_op(
@@ -218,27 +210,9 @@ class L1DecayRegularizer(WeightDecayRegularizer):
         """
         assert isinstance(param, framework.Parameter)
         assert isinstance(block, framework.Block)
+
         decay = block.create_var(
-            dtype="float32", shape=param.shape, lod_level=param.lod_level)
-
-        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-            idx = block.create_var(
-                dtype="int64",
-                shape=param.shape,
-                type=core.VarDesc.VarType.LOD_TENSOR)
-            decay = block.create_var(
-                dtype="float32",
-                shape=param.shape,
-                type=core.VarDesc.VarType.LOD_TENSOR)
-            block.append_op(
-                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
-            block.append_op(
-                type='lookup_table',
-                inputs={'W': param,
-                        'Ids': idx},
-                outputs={'Out': decay},
-                attrs={'is_sparse': True})
-            param = decay
+            dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
 
         # Append sign op
         block.append_op(
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index f63387a90617dc4e9b7c9ee7caa2d01595237a03..42ab9b231153f7ede7b8f8dd4e754f8cc92f65fe 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -38,7 +38,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 1
 BATCH_SIZE = 10
 
 embedding_name = 'emb'
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 28dc7519571d8b5464e92fddf634ba46691ceaa9..982d29180141d052e25ea3dcba6e3e7ce4181c48 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -128,6 +128,24 @@ class TestPriorBox(unittest.TestCase):
         assert box.shape[3] == 4
 
 
+class TestDensityPriorBox(unittest.TestCase):
+    def test_density_prior_box(self):
+        data_shape = [3, 224, 224]
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+        box, var = layers.density_prior_box(
+            input=conv1,
+            image=images,
+            densities=[3, 4],
+            fixed_sizes=[50., 60.],
+            fixed_ratios=[1.0],
+            clip=True)
+        assert len(box.shape) == 4
+        assert box.shape == var.shape
+        assert box.shape[3] == 4
+
+
 class TestAnchorGenerator(unittest.TestCase):
     def test_anchor_generator(self):
         data_shape = [3, 224, 224]
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
deleted file mode 100644
index bed847c3c168c906a89c32631b2a8f0ba2e6e7be..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-def bilinear_interp_np(input, out_h, out_w, out_size):
-    if out_size is not None:
-        out_h = out_size[0]
-        out_w = out_size[1]
-    batch_size, channel, in_h, in_w = input.shape
-    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    else:
-        ratio_h = 0.0
-    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
-    else:
-        ratio_w = 0.0
-
-    out = np.zeros((batch_size, channel, out_h, out_w))
-    for i in range(out_h):
-        h = int(ratio_h * i)
-        hid = 1 if h < in_h - 1 else 0
-        h1lambda = ratio_h * i - h
-        h2lambda = 1.0 - h1lambda
-        for j in range(out_w):
-            w = int(ratio_w * j)
-            wid = 1 if w < in_w - 1 else 0
-            w1lambda = ratio_w * j - w
-            w2lambda = 1.0 - w1lambda
-
-            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
-                                        w1lambda*input[:, :, h, w+wid]) + \
-                h1lambda*(w2lambda*input[:, :, h+hid, w] +
-                          w1lambda*input[:, :, h+hid, w+wid])
-    return out.astype(input.dtype)
-
-
-class TestBilinearInterpOp(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.init_test_case()
-        self.op_type = "bilinear_interp"
-        input_np = np.random.random(self.input_shape).astype("float32")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.input_shape = [2, 3, 4, 4]
-        self.out_h = 2
-        self.out_w = 2
-        self.out_size = np.array([3, 3]).astype("int32")
-
-
-class TestCase1(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-
-
-class TestCase2(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-
-
-class TestCase3(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-
-
-class TestCase4(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.out_size = np.array([2, 2]).astype("int32")
-
-
-class TestCase5(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.out_size = np.array([11, 11]).astype("int32")
-
-
-class TestCase6(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.out_size = np.array([65, 129]).astype("int32")
-
-
-class TestBilinearInterpOpUint8(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.init_test_case()
-        self.op_type = "bilinear_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output_with_place(place=core.CPUPlace(), atol=1)
-
-    def init_test_case(self):
-        self.input_shape = [1, 3, 9, 6]
-        self.out_h = 10
-        self.out_w = 9
-
-
-class TestCase1Uint8(TestBilinearInterpOpUint8):
-    def init_test_case(self):
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 120
-        self.out_w = 50
-
-
-class TestCase2Uint8(TestBilinearInterpOpUint8):
-    def init_test_case(self):
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 5
-        self.out_w = 13
-        self.out_size = np.array([6, 15]).astype("int32")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index aba3e7139c25fe66f814e41080afb8f1dad79e4b..ebbbf3ab8b00ff49d55ea5d472a2f7c4eae0da52 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
+        self.exhaustive_search = False
         self.use_cuda = False
         self.use_mkldnn = False
         self.data_format = "AnyLayout"
@@ -98,7 +99,8 @@ class TestConv2dOp(OpTest):
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format
+            'data_format': self.data_format,
+            'exhaustive_search': self.exhaustive_search
         }
         self.outputs = {'Output': output}
 
@@ -225,29 +227,29 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
 #----------------Conv2dCUDNN----------------
 
 
-def create_test_cudnn_class(parent, cls_name):
+def create_test_cudnn_class(parent):
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
 
-    cls_name = "{0}".format(cls_name)
+    cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
     globals()[cls_name] = TestCUDNNCase
 
 
-create_test_cudnn_class(TestConv2dOp, "TestPool2DCUDNNOp")
-create_test_cudnn_class(TestWithPad, "TestPool2DCUDNNOpCase1")
-create_test_cudnn_class(TestWithStride, "TestPool2DCUDNNOpCase2")
-create_test_cudnn_class(TestWithGroup, "TestPool2DCUDNNOpCase3")
-create_test_cudnn_class(TestWith1x1, "TestPool2DCUDNNOpCase4")
-create_test_cudnn_class(TestWithInput1x1Filter1x1, "TestPool2DCUDNNOpCase4")
+create_test_cudnn_class(TestConv2dOp)
+create_test_cudnn_class(TestWithPad)
+create_test_cudnn_class(TestWithStride)
+create_test_cudnn_class(TestWithGroup)
+create_test_cudnn_class(TestWith1x1)
+create_test_cudnn_class(TestWithInput1x1Filter1x1)
 
 #----------------Conv2dCUDNN----------------
 
 
-def create_test_cudnn_fp16_class(parent, cls_name, grad_check=True):
+def create_test_cudnn_fp16_class(parent, grad_check=True):
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestConv2DCUDNNFp16(parent):
@@ -279,23 +281,17 @@ def create_test_cudnn_fp16_class(parent, cls_name, grad_check=True):
                     max_relative_error=0.02,
                     no_grad_set=set(['Input']))
 
-    cls_name = "{0}".format(cls_name)
+    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16")
     TestConv2DCUDNNFp16.__name__ = cls_name
     globals()[cls_name] = TestConv2DCUDNNFp16
 
 
-create_test_cudnn_fp16_class(
-    TestConv2dOp, "TestPool2DCUDNNFp16Op", grad_check=False)
-create_test_cudnn_fp16_class(
-    TestWithPad, "TestPool2DCUDNNFp16OpCase1", grad_check=False)
-create_test_cudnn_fp16_class(
-    TestWithStride, "TestPool2DCUDNNFp16OpCase2", grad_check=False)
-create_test_cudnn_fp16_class(
-    TestWithGroup, "TestPool2DCUDNNFp16OpCase3", grad_check=False)
-create_test_cudnn_fp16_class(
-    TestWith1x1, "TestPool2DCUDNNFp16OpCase4", grad_check=False)
-create_test_cudnn_fp16_class(
-    TestWithInput1x1Filter1x1, "TestPool2DCUDNNFp16OpCase4", grad_check=False)
+create_test_cudnn_fp16_class(TestConv2dOp, grad_check=False)
+create_test_cudnn_fp16_class(TestWithPad, grad_check=False)
+create_test_cudnn_fp16_class(TestWithStride, grad_check=False)
+create_test_cudnn_fp16_class(TestWithGroup, grad_check=False)
+create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
+create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)
 
 # -------TestDepthwiseConv
 
@@ -367,6 +363,12 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"
 
 
+class TestCUDNNExhaustiveSearch(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.exhaustive_search = True
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index ddaf99fe061205f0f2e4c592c9e28e27e657c16a..69c5ab7a4a4cbd552d27dcb07052d46752eeb54a 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -335,6 +335,12 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
                 self.check_output_with_place(place, atol=2e-2)
 
 
+class TestCUDNNExhaustiveSearch(TestCUDNN):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.exhaustive_search = True
+
+
 # FIXME(typhoonzero): find a way to determine if
 # using cudnn > 6 in python
 # class TestWithDilationCUDNN(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..79d1fd3d7171e06a88a75cf50b6a51ef4da51f07
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestDensityPriorBoxOp(OpTest):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {'Input': self.input, 'Image': self.image}
+
+        self.attrs = {
+            'variances': self.variances,
+            'clip': self.clip,
+            'step_w': self.step_w,
+            'step_h': self.step_h,
+            'offset': self.offset,
+            'densities': self.densities,
+            'fixed_sizes': self.fixed_sizes,
+            'fixed_ratios': self.fixed_ratios
+        }
+        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "density_prior_box"
+        self.set_data()
+
+    def set_density(self):
+        self.densities = []
+        self.fixed_sizes = []
+        self.fixed_ratios = []
+
+    def init_test_params(self):
+        self.layer_w = 32
+        self.layer_h = 32
+
+        self.image_w = 40
+        self.image_h = 40
+
+        self.step_w = float(self.image_w) / float(self.layer_w)
+        self.step_h = float(self.image_h) / float(self.layer_h)
+
+        self.input_channels = 2
+        self.image_channels = 3
+        self.batch_size = 10
+
+        self.variances = [0.1, 0.1, 0.2, 0.2]
+        self.variances = np.array(self.variances, dtype=np.float).flatten()
+
+        self.set_density()
+
+        self.clip = True
+        self.num_priors = 0
+        if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
+            for density in self.densities:
+                if len(self.fixed_ratios) > 0:
+                    self.num_priors += len(self.fixed_ratios) * (pow(density,
+                                                                     2))
+        self.offset = 0.5
+
+    def init_test_input(self):
+        self.image = np.random.random(
+            (self.batch_size, self.image_channels, self.image_w,
+             self.image_h)).astype('float32')
+
+        self.input = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_w,
+             self.layer_h)).astype('float32')
+
+    def init_test_output(self):
+        out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
+        out_boxes = np.zeros(out_dim).astype('float32')
+        out_var = np.zeros(out_dim).astype('float32')
+
+        step_average = int((self.step_w + self.step_h) * 0.5)
+        for h in range(self.layer_h):
+            for w in range(self.layer_w):
+                idx = 0
+                c_x = (w + self.offset) * self.step_w
+                c_y = (h + self.offset) * self.step_h
+                # Generate density prior boxes with fixed size
+                for density, fixed_size in zip(self.densities,
+                                               self.fixed_sizes):
+                    if (len(self.fixed_ratios) > 0):
+                        for ar in self.fixed_ratios:
+                            shift = int(step_average / density)
+                            box_width_ratio = fixed_size * math.sqrt(ar)
+                            box_height_ratio = fixed_size / math.sqrt(ar)
+                            for di in range(density):
+                                for dj in range(density):
+                                    c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift
+                                    c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift
+                                    out_boxes[h, w, idx, :] = [
+                                        max((c_x_temp - box_width_ratio / 2.0) /
+                                            self.image_w, 0),
+                                        max((c_y_temp - box_height_ratio / 2.0)
+                                            / self.image_h, 0),
+                                        min((c_x_temp + box_width_ratio / 2.0) /
+                                            self.image_w, 1),
+                                        min((c_y_temp + box_height_ratio / 2.0)
+                                            / self.image_h, 1)
+                                    ]
+                                    idx += 1
+        if self.clip:
+            out_boxes = np.clip(out_boxes, 0.0, 1.0)
+        out_var = np.tile(self.variances,
+                          (self.layer_h, self.layer_w, self.num_priors, 1))
+        self.out_boxes = out_boxes.astype('float32')
+        self.out_var = out_var.astype('float32')
+
+
+class TestDensityPriorBox(TestDensityPriorBoxOp):
+    def set_density(self):
+        self.densities = [3, 4]
+        self.fixed_sizes = [1.0, 2.0]
+        self.fixed_ratios = [1.0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 45fae63b01e6a766202408d023e0292579c2857a..4b8a215190a90c974a9ecc8658d044c59b80c989 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -98,17 +98,18 @@ class TestDistRunnerBase(object):
         strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
-        if args.batch_merge_repeat > 1:
-            pass_builder = build_stra._create_passes_from_strategy()
-            mypass = pass_builder.insert_pass(
-                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
-            mypass.set_int("num_repeats", args.batch_merge_repeat)
 
         if args.use_reduce:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
         else:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
 
+        if args.batch_merge_repeat > 1:
+            pass_builder = build_stra._create_passes_from_strategy()
+            mypass = pass_builder.insert_pass(
+                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+            mypass.set_int("num_repeats", args.batch_merge_repeat)
+
         exe = fluid.ParallelExecutor(
             args.use_cuda,
             loss_name=avg_cost.name,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 986fdd9ff27fe2be54ce97f330028b4ae2358714..d132dd3c48f55c07725515e40faeb5076398adeb 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -373,9 +373,8 @@ class TestL2Decay(TranspilerTest):
         self.assertEqual(len(pserver.blocks), 3)
         self.assertEqual([op.type for op in pserver.blocks[1].ops],
                          ["sum", "scale", "clip", "sgd"])
-        self.assertEqual(
-            [op.type for op in pserver.blocks[2].ops],
-            ["sum", "scale", "clip", "scale", "elementwise_add", "sgd"])
+        self.assertEqual([op.type for op in pserver.blocks[2].ops],
+                         ["sum", "scale", "clip", "scale", "sum", "sgd"])
         # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer
 
 
@@ -416,12 +415,10 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             "logical_and", "conditional_block", "fill_constant",
             "conditional_block"
         ])
-        self.assertEqual(
-            [op.type for op in pserver.blocks[7].ops],
-            ["sum", "scale", "scale", "elementwise_add", "momentum"])
-        self.assertEqual(
-            [op.type for op in pserver.blocks[8].ops],
-            ["sum", "scale", "scale", "elementwise_add", "momentum"])
+        self.assertEqual([op.type for op in pserver.blocks[7].ops],
+                         ["sum", "scale", "scale", "sum", "momentum"])
+        self.assertEqual([op.type for op in pserver.blocks[8].ops],
+                         ["sum", "scale", "scale", "sum", "momentum"])
 
 
 class TestEmptyPserverOptimizeBlocks(TranspilerTest):
@@ -570,7 +567,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'uniform_random',
             'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
             'fake_init'
@@ -642,7 +638,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer, _ = self.get_trainer(config)
+        trainer, trainer_startup = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
@@ -656,6 +652,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
             'recv', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+        startup_ops = [
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
+        ]
+        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
+                         startup_ops)
 
 
 class TestDistLookupTableSliceSize(TestDistLookupTableBase):
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 6a129b6df9bf1830fdf5eb5cb9ae0c5e4f7bb4ec..53409e436c0739bce63a3a8f90591e0ca6836859 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -117,56 +117,5 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
         }
 
 
-class TestElementWiseMulSelectedRows(OpTest):
-    def setUp(self):
-        self.rows = [0, 1, 2, 3, 4, 5, 6]
-        self.feature = 12
-        self.height = 100
-        self.input_shape = (len(self.rows), self.feature)
-
-    def prepare_input(self, scope, place):
-        self.input = {
-            "X": np.random.random(self.input_shape).astype("float32"),
-            "Y": np.random.random(self.input_shape).astype("float32")
-        }
-
-        def init_input(in_name):
-            x_selected_rows = scope.var(in_name).get_selected_rows()
-            x_selected_rows.set_height(self.height)
-            x_selected_rows.set_rows(self.rows)
-            x_array = self.input[in_name]
-            x_tensor = x_selected_rows.get_tensor()
-            x_tensor.set(x_array, place)
-
-        init_input("X")
-        init_input("Y")
-
-    def create_out_selected_row(self, scope):
-        return scope.var('Out').get_selected_rows()
-
-    def check_result(self, out_selected_rows):
-        assert out_selected_rows.height() == self.height
-        assert out_selected_rows.rows() == self.rows
-        out_tensor = np.array(out_selected_rows.get_tensor())
-        assert out_tensor.shape == self.input_shape
-
-    def check_with_place(self, place):
-        scope = core.Scope()
-        self.prepare_input(scope, place)
-
-        out_selected_rows = self.create_out_selected_row(scope)
-        out_selected_rows.set_height(0)
-        out_selected_rows.set_rows([])
-
-        elementwise_mul = Operator("elementwise_mul", X='X', Y='Y', Out='Out')
-        elementwise_mul.run(scope, place)
-        self.check_result(out_selected_rows)
-
-    def test_elewisemul_with_selected_rows_input(self):
-        places = [core.CPUPlace()]
-        for place in places:
-            self.check_with_place(place)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
deleted file mode 100644
index 8629bcf0f2e3c37aefdbf79b203176a43e0c3a7e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-
-
-class TestExtractRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Variable
-        feature_len = 12
-        rows = [0, 4, 4, 7]
-        np_array = np.ones((len(rows), feature_len)).astype("float32")
-
-        in_x = scope.var('X').get_selected_rows()
-        in_x.set_height(len(rows))
-        in_x.set_rows(rows)
-        in_x_tensor = in_x.get_tensor()
-        in_x_tensor.set(np_array, place)
-
-        # create Out Variable
-        out_tensor = scope.var('Out').get_tensor()
-
-        # create and run lookup_table operator
-        extract_rows_op = Operator("extract_rows", X='X', Out='Out')
-        extract_rows_op.run(scope, place)
-
-        # get result from Out
-        result_array = np.array(out_tensor)
-        result_array = [ele[0] for ele in result_array]
-        assert result_array == rows
-
-    def test_concat_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_interpolate_op.py b/python/paddle/fluid/tests/unittests/test_interpolate_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9748d094cda6ee9dc649d95d1ca7f1c4b55d1031
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_interpolate_op.py
@@ -0,0 +1,335 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def nearest_neighbor_interp_np(X,
+                               out_h,
+                               out_w,
+                               out_size=None,
+                               actual_shape=None):
+    """NumPy reference for nearest-neighbor resize of an [N, C, H, W] array."""
+    if out_size is not None:  # out_size, when given, replaces out_h/out_w
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:  # applied last, so it wins over out_size
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    n, c, in_h, in_w = X.shape
+    # Scale so output index out-1 lands on input index in-1; 0.0 if out <= 1.
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    # int(x + 0.5) rounds each source coordinate to the nearest input index.
+    out = np.zeros((n, c, out_h, out_w))
+    for i in range(out_h):
+        in_i = int(ratio_h * i + 0.5)
+        for j in range(out_w):
+            in_j = int(ratio_w * j + 0.5)
+            out[:, :, i, j] = X[:, :, in_i, in_j]
+    # np.zeros defaults to float64; return the result in the input's dtype.
+    return out.astype(X.dtype)
+
+
+def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
+    """NumPy reference for bilinear resize of an [N, C, H, W] array."""
+    if out_size is not None:  # out_size, when given, replaces out_h/out_w
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:  # applied last, so it wins over out_size
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    batch_size, channel, in_h, in_w = input.shape
+    if out_h > 1:  # scale so output row out_h-1 lands on input row in_h-1
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    else:
+        ratio_h = 0.0
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    else:
+        ratio_w = 0.0
+    # Accumulate in float64 (np.zeros default); cast back on return.
+    out = np.zeros((batch_size, channel, out_h, out_w))
+    for i in range(out_h):
+        h = int(ratio_h * i)  # first of the two neighbouring input rows
+        hid = 1 if h < in_h - 1 else 0  # 0 on the last row keeps h+hid in bounds
+        h1lambda = ratio_h * i - h  # weight of row h+hid
+        h2lambda = 1.0 - h1lambda  # weight of row h
+        for j in range(out_w):
+            w = int(ratio_w * j)
+            wid = 1 if w < in_w - 1 else 0
+            w1lambda = ratio_w * j - w
+            w2lambda = 1.0 - w1lambda
+            # Weighted sum of the (up to four) neighbouring input pixels.
+            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
+                                        w1lambda*input[:, :, h, w+wid]) + \
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+    return out.astype(input.dtype)
+
+
+INTERPOLATE_FUNCS = {  # interp_method value -> NumPy reference function
+    'bilinear': bilinear_interp_np,
+    'nearest': nearest_neighbor_interp_np,
+}
+
+
+class TestInterpolateOp(OpTest):  # float32 base case; subclasses override init_test_case
+    def setUp(self):
+        self.out_size = None  # optional overrides; init_test_case() may set them
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "interpolate"
+        input_np = np.random.random(self.input_shape).astype("float32")
+        # Expected output comes from the NumPy reference chosen by interp_method.
+        output_np = INTERPOLATE_FUNCS[self.interp_method](
+            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:  # shares the 'OutSize' slot; wins if both are set
+            self.inputs['OutSize'] = self.actual_shape
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):  # 4x4 input; attrs say 2x2, OutSize input says 3x3
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 4, 4]
+        self.out_h = 2
+        self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
+
+
+class TestBilinearInterpCase1(TestInterpolateOp):
+    def init_test_case(self):  # 7x8 -> 1x1; size given only by attrs
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestBilinearInterpCase2(TestInterpolateOp):
+    def init_test_case(self):  # 9x6 -> 12x12; size given only by attrs
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestBilinearInterpCase3(TestInterpolateOp):
+    def init_test_case(self):  # 128x64 -> 64x128; size given only by attrs
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+class TestBilinearInterpCase4(TestInterpolateOp):
+    def init_test_case(self):  # 7x8 input; attrs say 1x1, OutSize input says 2x2
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestBilinearInterpCase5(TestInterpolateOp):
+    def init_test_case(self):  # 9x6 input; attrs say 12x12, OutSize input says 11x11
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestBilinearInterpCase6(TestInterpolateOp):
+    def init_test_case(self):  # 128x64 input; attrs say 64x128, OutSize input says 65x129
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
+class TestBilinearInterpActualShape(TestInterpolateOp):
+    def init_test_case(self):  # 32x16 input; attrs say 64x32, OutSize input says 66x40
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.out_size = np.array([66, 40]).astype("int32")  # NOTE(review): sets out_size, not actual_shape, despite the class name - confirm
+
+
+class TestBilinearInterpBigScale(TestInterpolateOp):
+    def init_test_case(self):  # 64x32 input; attrs say 100x50, OutSize input says 101x51
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 4, 64, 32]
+        self.out_h = 100
+        self.out_w = 50
+        self.out_size = np.array([101, 51]).astype('int32')
+
+
+class TestInterpolateOpUint8(OpTest):  # uint8 base case: output check only, no gradient check
+    def setUp(self):
+        self.out_size = None  # optional override; init_test_case() may set it
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "interpolate"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = INTERPOLATE_FUNCS[self.interp_method](
+            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:  # NOTE(review): actual_shape reaches the reference but is never fed as 'OutSize' here
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)  # CPU place only, atol=1
+
+    def init_test_case(self):  # 9x6 -> 10x9; size given only by attrs
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
+    def init_test_case(self):  # 128x64 -> 120x50; size given only by attrs
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
+    def init_test_case(self):  # 7x8 input; attrs say 5x13, OutSize input says 6x15
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
+class TestNearestNeighborInterpCase1(TestInterpolateOp):
+    def init_test_case(self):  # 7x8 -> 1x1; size given only by attrs
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestNearestNeighborInterpCase2(TestInterpolateOp):
+    def init_test_case(self):  # 9x6 -> 12x12; size given only by attrs
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestNearestNeighborInterpCase3(TestInterpolateOp):
+    def init_test_case(self):  # 128x64 -> 64x128; size given only by attrs
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+class TestNearestNeighborInterpCase4(TestInterpolateOp):
+    def init_test_case(self):  # 7x8 input; attrs say 1x1, OutSize input says 2x2
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestNearestNeighborInterpCase5(TestInterpolateOp):
+    def init_test_case(self):  # 9x6 input; attrs say 12x12, OutSize input says 11x11
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestNearestNeighborInterpCase6(TestInterpolateOp):
+    def init_test_case(self):  # 128x64 input; attrs say 64x128, OutSize input says 65x129
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
+class TestNearestNeighborInterpActualShape(TestInterpolateOp):
+    def init_test_case(self):  # 32x16 input; attrs say 64x32, OutSize input says 66x40
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.out_size = np.array([66, 40]).astype("int32")  # NOTE(review): sets out_size, not actual_shape, despite the class name - confirm
+
+
+class TestNearestNeighborInterpBigScale(TestInterpolateOp):
+    def init_test_case(self):  # 64x32 input; attrs say 100x50, OutSize input says 101x51
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 4, 64, 32]
+        self.out_h = 100
+        self.out_w = 50
+        self.out_size = np.array([101, 51]).astype('int32')
+
+
+class TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8):
+    def init_test_case(self):  # 128x64 -> 120x50; size given only by attrs
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8):
+    def init_test_case(self):  # 7x8 input; attrs say 5x13, OutSize input says 6x15
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index c4ecc2c2c2563fcad09821453ee73e41f81407d5..a8fa5436c43d2f05f632b920f67d43d837d28da9 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -248,6 +248,17 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.softmax(hid))
         print(str(program))
 
+    def test_space_to_depth(self):  # smoke test: the layer call returns a non-None result
+        program = Program()
+        with program_guard(program):
+            data = layers.data(
+                name='data',
+                shape=[32, 9, 6, 6],
+                append_batch_size=False,
+                dtype='float32')
+            self.assertIsNotNone(layers.space_to_depth(data, 3))
+        print(str(program))
+
     def test_sequence_unsqueeze(self):
         program = Program()
         with program_guard(program):
@@ -358,6 +369,10 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             x = layers.data(name='x', shape=[16], dtype='float32')
             y = layers.data(name='label', shape=[1], dtype='int64')
+            loss, softmax = layers.softmax_with_cross_entropy(
+                x, y, return_softmax=True)
+            self.assertIsNotNone(loss)
+            self.assertIsNotNone(softmax)
             loss = layers.softmax_with_cross_entropy(x, y)
             self.assertIsNotNone(loss)
         print(str(program))
@@ -485,6 +500,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_resize_nearest(self):  # smoke test for both ways of giving the target size
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
+            output = layers.resize_nearest(x, out_shape=[12, 12])  # explicit out_shape
+            self.assertIsNotNone(output)
+            output = layers.resize_nearest(x, scale=3)  # size given via scale instead
+            self.assertIsNotNone(output)
+        print(str(program))
+
     def test_polygon_box_transform(self):
         program = Program()
         with program_guard(program):
@@ -890,6 +915,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(data_1)
         print(str(program))
 
+    def test_bilinear_tensor_product_layer(self):
+        # Smoke test: bilinear_tensor_product(data, theta, 6) must return a result.
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[4], dtype="float32")
+            theta = layers.data(name="theta", shape=[5], dtype="float32")
+            out = layers.bilinear_tensor_product(data, theta, 6)
+            self.assertIsNotNone(out)  # previously `out` was built but never checked
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
index 11e5d8b536fb65b66c954991bf815241774702ec..c7f4f3e913bfd66cbbb703c0e73336f9a3563507 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
@@ -80,6 +80,33 @@ class TestLookupSpraseTable(OpTest):
         assert (result_array2[3] == w_array[6]).all()
         assert (result_array2[4] == w_array[7]).all()
 
+        # create and run lookup_sparse_table operator with is_test=True
+        test_lookup_table = Operator(
+            "lookup_sparse_table",
+            W='W',
+            Ids='Ids',
+            Out='Out',
+            min=-5.0,
+            max=10.0,
+            seed=10,
+            is_test=True)
+
+        ids = scope.var("Ids").get_tensor()
+        unknown_id = [44, 22, 33]
+        ids_array2 = np.array([4, 2, 3, 7, 100000] + unknown_id).astype("int64")
+        ids.set(ids_array2, place)
+        test_lookup_table.run(scope, place)
+
+        result_array2 = np.array(out_tensor)
+        assert (result_array2[0] == w_array[5]).all()
+        assert (result_array2[1] == w_array[1]).all()
+        assert (result_array2[2] == w_array[2]).all()
+        assert (result_array2[3] == w_array[6]).all()
+        assert (result_array2[4] == w_array[7]).all()
+
+        for i in [5, 6, 7]:
+            assert np.all(result_array2[i] == 0)
+
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
         # currently only support CPU
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index d6dbedcf875b06f5af4597e6dad71a39f286a471..84b0aad8acb096a32f625e32fb640599f2882d97 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import paddle.dataset.conll05 as conll05
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import unittest
 import paddle
 import numpy as np
@@ -177,32 +178,36 @@ class TestCRFModel(unittest.TestCase):
     def test_update_sparse_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                is_sparse=True, build_strategy=build_strategy, use_cuda=True)
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
     def test_update_dense_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                is_sparse=False, build_strategy=build_strategy, use_cuda=True)
         self.check_network_convergence(
             is_sparse=False, build_strategy=build_strategy, use_cuda=False)
 
     def test_update_sparse_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                is_sparse=True, build_strategy=build_strategy, use_cuda=True)
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
     def test_update_dense_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                is_sparse=False, build_strategy=build_strategy, use_cuda=True)
         self.check_network_convergence(
             is_sparse=False, build_strategy=build_strategy, use_cuda=False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index c93740669f40aee3a6c143d153cfd0f5bb72dbd9..18d95c94ad36316b7149eb5412260b40a57ac002 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -21,8 +21,8 @@ import six
 class TestBase(unittest.TestCase):
     def main(self,
              network_func,
-             iter=100,
-             iter_per_pe=100,
+             iter=10,
+             iter_per_pe=10,
              use_gpu=True,
              use_experimental_executor=False):
         if use_gpu and not fluid.core.is_compiled_with_cuda():
@@ -45,7 +45,7 @@ class TestBase(unittest.TestCase):
             exe_strategy._dry_run = True
             exe_strategy.use_experimental_executor = use_experimental_executor
             pe = fluid.ParallelExecutor(
-                use_cuda=True,
+                use_cuda=use_gpu,
                 loss_name=loss.name,
                 main_program=main_prog,
                 exec_strategy=exe_strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index b7fad9b3a60632adb564e1d155a3d935706b467f..d94494e219c5f348a08b4c3c2d111674ea6badf3 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -53,15 +53,24 @@ def simple_fc_net(in_size,
                   hidden_sizes,
                   batch_size,
                   queue_capacity,
-                  use_double_buffer=False):
-    reader = fluid.layers.py_reader(
-        capacity=queue_capacity,
-        shapes=[[-1, in_size], [-1, 1]],
-        lod_levels=[0, 0],
-        dtypes=['float32', 'int64'],
-        use_double_buffer=False)
-    feed_queue = reader.queue
-    reader = fluid.layers.batch(reader, batch_size=batch_size)
+                  use_double_buffer=False,
+                  use_feed_list=True):
+    if use_feed_list:
+        data = fluid.layers.data(name="data", dtype='float32', shape=[in_size])
+        label = fluid.layers.data(name='label', dtype='int64', shape=[1])
+        py_reader = fluid.layers.create_py_reader_by_data(
+            capacity=queue_capacity,
+            use_double_buffer=False,
+            feed_list=[data, label])
+    else:
+        py_reader = fluid.layers.py_reader(
+            capacity=queue_capacity,
+            shapes=[[-1, in_size], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            use_double_buffer=False)
+    feed_queue = py_reader.queue
+    reader = fluid.layers.batch(py_reader, batch_size=batch_size)
     if use_double_buffer:
         reader = fluid.layers.double_buffer(reader)
 
@@ -83,7 +92,7 @@ def simple_fc_net(in_size,
 
     optimizer = fluid.optimizer.Adam()
     optimizer.minimize(loss)
-    return in_data, label, loss, optimizer, feed_queue
+    return in_data, label, loss, optimizer, feed_queue, py_reader
 
 
 class TestPyReaderUsingExecutor(unittest.TestCase):
@@ -100,16 +109,22 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             for use_parallel_executor in [False, True]:
                 for use_double_buffer in [False, True]:
-                    print('Test Parameters:'),
-                    print({
-                        'use_cuda': use_cuda,
-                        'use_parallel_executor': use_parallel_executor,
-                        'use_double_buffer': use_double_buffer
-                    })
-                    self.main(use_cuda, use_parallel_executor,
-                              use_double_buffer)
-
-    def random_reader(self):
+                    for use_feed_list in [False, True]:
+                        for use_decorate_paddle_reader in [False, True]:
+                            print('Test Parameters:'),
+                            print({
+                                'use_cuda': use_cuda,
+                                'use_parallel_executor': use_parallel_executor,
+                                'use_double_buffer': use_double_buffer,
+                                'use_feed_list': use_feed_list,
+                                'use_decorate_paddle_reader':
+                                use_decorate_paddle_reader
+                            })
+                            self.main(use_cuda, use_parallel_executor,
+                                      use_double_buffer, use_feed_list,
+                                      use_decorate_paddle_reader)
+
+    def tensor_reader(self, use_decorate_paddle_reader):
         def reader():
             self.inputs = []
             cnt = 0
@@ -133,34 +148,43 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
                 elif not self.use_double_buffer:
                     break
 
-                yield tensors
+                if use_decorate_paddle_reader:
+                    yield [(in_data, label)]
+                else:
+                    yield tensors
                 cnt += 1
 
-            yield None
+            if not use_decorate_paddle_reader:
+                yield None
 
         return reader
 
     def main(self,
              use_cuda=True,
              use_parallel_executor=False,
-             use_double_buffer=False):
+             use_double_buffer=False,
+             use_feed_list=False,
+             use_decorate_paddle_reader=False):
         assert not use_cuda or use_cuda and core.is_compiled_with_cuda()
 
         self.use_cuda = use_cuda
         self.use_parallel_executor = use_parallel_executor
         self.use_double_buffer = use_double_buffer
+        self.use_feed_list = use_feed_list
+        self.use_decorate_paddle_reader = use_decorate_paddle_reader
 
         startup_program = fluid.Program()
         main_program = fluid.Program()
 
         with fluid.program_guard(main_program, startup_program):
-            in_data, label, loss, optimizer, feed_queue = simple_fc_net(
+            in_data, label, loss, optimizer, feed_queue, py_reader = simple_fc_net(
                 in_size=self.in_size,
                 class_num=self.class_num,
                 hidden_sizes=self.hidden_sizes,
                 batch_size=self.batch_size,
                 queue_capacity=self.queue_capacity,
-                use_double_buffer=self.use_double_buffer)
+                use_double_buffer=self.use_double_buffer,
+                use_feed_list=self.use_feed_list)
 
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -178,10 +202,14 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
                 main_exe = startup_exe
                 self.batch_size_times = 1
 
-            reader = self.random_reader()
-            thread = threading.Thread(
-                target=feed_data, args=(feed_queue, reader))
-            thread.start()
+            reader = self.tensor_reader(use_decorate_paddle_reader)
+            if use_decorate_paddle_reader:
+                py_reader.decorate_paddle_reader(reader)
+                py_reader.start()
+            else:
+                thread = threading.Thread(
+                    target=feed_data, args=(feed_queue, reader))
+                thread.start()
 
             self.outputs = []
             for _ in range(self.iterations):
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
index 6727335c6059161d235a64a1b90d36b84004f9b3..20f91cf4485f2e79c20fe90143c8b7deebb9fc49 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -55,7 +55,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
         params_grads = optimizer.append_regularization_ops(params_grads)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(block.ops), count_ops + 2)
-        self.assertEqual(block.ops[-1].type, 'elementwise_add')
+        self.assertEqual(block.ops[-1].type, 'sum')
         self.assertEqual(block.ops[-2].type, 'scale')
 
 
@@ -92,7 +92,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
         params_grads = optimizer.append_regularization_ops(params_grads)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(block.ops), count_ops + 3)
-        self.assertEqual(block.ops[-1].type, 'elementwise_add')
+        self.assertEqual(block.ops[-1].type, 'sum')
         self.assertEqual(block.ops[-2].type, 'scale')
         self.assertEqual(block.ops[-3].type, 'sign')
 
diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..b3833f05f1aa3aac7b5bcc5b6fdc138870cc8844
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -0,0 +1,217 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestSimilarityFocusOp(OpTest):
+    def setUp(self):
+        # Hand-written 2x3x2x2 input; the mask is built from index 0 on axis 1.
+        self.op_type = "similarity_focus"
+        batch_size = 2
+        x_dim, y_dim, z_dim = 3, 2, 2
+        self.inputs = {
+            'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
+                            [[0.8, 0.9], [0.1, 0.2]]],
+                           [[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
+                            [[0.0, 0.2], [0.4, 0.7]]]]),
+        }
+        self.attrs = {
+            'axis': 1,
+            'indexes': [0],
+        }
+        max_picks = min(y_dim, z_dim)
+
+        per_batch = []
+        for batch in range(batch_size):
+            # Flat 0/1 mask over the (y, z) plane of this sample.
+            res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                plane = self.inputs['X'][batch, index, :, :]
+                channel = plane.reshape(-1).copy()
+                row_used = [0] * y_dim
+                col_used = [0] * z_dim
+                picked = 0
+                # Walk cells from largest to smallest value; keep a cell only
+                # if neither its row nor its column has been claimed yet.
+                for _ in range(channel.size):
+                    pos = channel.argmax()
+                    row, col = pos // z_dim, pos % z_dim
+                    if row_used[row] == 0 and col_used[col] == 0:
+                        row_used[row] = col_used[col] = 1
+                        res[pos] = 1
+                        picked += 1
+                        if picked == max_picks:
+                            break
+                    channel[pos] = -1  # below every input value, so never the argmax again
+            # Repeat the 2-D mask along axis 1, then add the batch axis.
+            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
+            per_batch.append(res.reshape(1, x_dim, y_dim, z_dim))
+        self.outputs = {'Out': np.concatenate(per_batch, axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSimilarityFocusOp_axis1(OpTest):
+    def setUp(self):
+        # Random 3x4x5x6 input; mask is the union over indexes 0 and 3 on axis 1.
+        self.op_type = "similarity_focus"
+        batch_size = 3
+        x_dim, y_dim, z_dim = 4, 5, 6
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 1,
+            'indexes': [0, 3],
+        }
+        max_picks = min(y_dim, z_dim)
+
+        per_batch = []
+        for batch in range(batch_size):
+            # Flat 0/1 mask over the (y, z) plane of this sample.
+            res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                plane = self.inputs['X'][batch, index, :, :]
+                channel = plane.reshape(-1).copy()
+                row_used = [0] * y_dim
+                col_used = [0] * z_dim
+                picked = 0
+                # Walk cells from largest to smallest value; keep a cell only
+                # if neither its row nor its column has been claimed yet.
+                for _ in range(channel.size):
+                    pos = channel.argmax()
+                    row, col = pos // z_dim, pos % z_dim
+                    if row_used[row] == 0 and col_used[col] == 0:
+                        row_used[row] = col_used[col] = 1
+                        res[pos] = 1
+                        picked += 1
+                        if picked == max_picks:
+                            break
+                    channel[pos] = -1  # below every input value, so never the argmax again
+            # Repeat the 2-D mask along axis 1, then add the batch axis.
+            res = res.reshape(1, y_dim, z_dim)
+            res = res.repeat([x_dim], axis=0)
+            per_batch.append(res.reshape(1, x_dim, y_dim, z_dim))
+        self.outputs = {'Out': np.concatenate(per_batch, axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSimilarityFocusOp_axis2(OpTest):
+    def setUp(self):
+        # Random 6x7x8x9 input; mask is the union over indexes 0, 3, 5 on axis 2.
+        self.op_type = "similarity_focus"
+        batch_size = 6
+        x_dim, y_dim, z_dim = 7, 8, 9
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 2,
+            'indexes': [0, 3, 5],
+        }
+        max_picks = min(x_dim, z_dim)
+
+        per_batch = []
+        for batch in range(batch_size):
+            # Flat 0/1 mask over the (x, z) plane of this sample.
+            res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                plane = self.inputs['X'][batch, :, index, :]
+                channel = plane.reshape(-1).copy()
+                row_used = [0] * x_dim
+                col_used = [0] * z_dim
+                picked = 0
+                # Walk cells from largest to smallest value; keep a cell only
+                # if neither its row nor its column has been claimed yet.
+                for _ in range(channel.size):
+                    pos = channel.argmax()
+                    row, col = pos // z_dim, pos % z_dim
+                    if row_used[row] == 0 and col_used[col] == 0:
+                        row_used[row] = col_used[col] = 1
+                        res[pos] = 1
+                        picked += 1
+                        if picked == max_picks:
+                            break
+                    channel[pos] = -1  # below every input value, so never the argmax again
+            # Repeat the 2-D mask along axis 2, then add the batch axis.
+            res = res.reshape(x_dim, 1, z_dim)
+            res = res.repeat([y_dim], axis=1)
+            per_batch.append(res.reshape(1, x_dim, y_dim, z_dim))
+        self.outputs = {'Out': np.concatenate(per_batch, axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSimilarityFocusOp_axis3(OpTest):
+    def setUp(self):
+        # Random 64x48x48x13 input; mask is the union over indexes 0, 2, 7, 9 on axis 3.
+        self.op_type = "similarity_focus"
+        batch_size = 64
+        x_dim, y_dim, z_dim = 48, 48, 13
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 3,
+            'indexes': [0, 2, 7, 9],
+        }
+        max_picks = min(x_dim, y_dim)
+
+        per_batch = []
+        for batch in range(batch_size):
+            # Flat 0/1 mask over the (x, y) plane of this sample.
+            res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                plane = self.inputs['X'][batch, :, :, index]
+                channel = plane.reshape(-1).copy()
+                row_used = [0] * x_dim
+                col_used = [0] * y_dim
+                picked = 0
+                # Walk cells from largest to smallest value; keep a cell only
+                # if neither its row nor its column has been claimed yet.
+                for _ in range(channel.size):
+                    pos = channel.argmax()
+                    row, col = pos // y_dim, pos % y_dim
+                    if row_used[row] == 0 and col_used[col] == 0:
+                        row_used[row] = col_used[col] = 1
+                        res[pos] = 1
+                        picked += 1
+                        if picked == max_picks:
+                            break
+                    channel[pos] = -1  # below every input value, so never the argmax again
+            # Repeat the 2-D mask along axis 3, then add the batch axis.
+            res = res.reshape(x_dim, y_dim, 1)
+            res = res.repeat([z_dim], axis=2)
+            per_batch.append(res.reshape(1, x_dim, y_dim, z_dim))
+        self.outputs = {'Out': np.concatenate(per_batch, axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fdad44f1242b9ee99040b43d7ce2cf84664eed1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from op_test import OpTest
+
+
+class TestSpaceToDepthOp(OpTest):
+    @staticmethod
+    def helper(in_, width, height, channel, batch, blocksize, forward, out_):
+        """Reference index shuffle on flat buffers (copy direction: forward)."""
+        bs, c_out = blocksize, channel // (blocksize * blocksize)
+        for b in range(batch):
+            for k in range(channel):
+                for j in range(height):
+                    for i in range(width):
+                        c2, shift = k % c_out, k // c_out
+                        w2 = i * bs + shift % bs
+                        h2 = j * bs + shift // bs
+                        src = i + width * (j + height * (k + channel * b))
+                        dst = w2 + width * bs * (
+                            h2 + height * bs * (c2 + c_out * b))
+                        if forward:
+                            out_[dst] = in_[src]
+                        else:
+                            out_[src] = in_[dst]
+
+    def setUp(self):
+        # init_data() supplies x, its flat views and the expected output shape.
+        self.init_data()
+        self.op_type = "space_to_depth"
+        self.attrs = {"blocksize": self.blocksize}
+        batch, channel, height, width = self.x.shape
+        self.helper(self.x_1d, width, height, channel, batch, self.blocksize,
+                    self.forward, self.out_1d)
+        self.out = np.reshape(self.out_1d, self.infered_shape)
+        self.inputs = {"X": self.x}
+        self.outputs = {"Out": self.out}
+
+    def init_data(self):
+        self.blocksize = 2
+        self.forward = 1
+        self.ori_shape = (32, 12, 6, 6)
+        self.infered_shape = (32, 48, 3, 3)
+        self.one_d_len = 32 * 48 * 3 * 3
+
+        # x_1d / out_1d are flattened to one_d_len so helper() can index them.
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+
+    def test_check_output(self):
+        use_gpu = fluid.core.is_compiled_with_cuda()
+        place = fluid.core.CUDAPlace(0) if use_gpu else fluid.core.CPUPlace()
+        self.check_output_with_place(place, 1e-5, None, False)
+
+    def test_check_grad(self):
+        use_gpu = fluid.core.is_compiled_with_cuda()
+        place = fluid.core.CUDAPlace(0) if use_gpu else fluid.core.CPUPlace()
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
+class TestSpaceToDepthOpBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.blocksize = 2
+        self.forward = 1
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+        # float32 case with 8 input channels; *_1d are the flattened buffers.
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+
+
+class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.blocksize = 2
+        self.forward = 1
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+        # float64 case with 8 input channels; *_1d are the flattened buffers.
+        self.x = np.random.random(self.ori_shape).astype('float64')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float64')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+
+
+class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp):
+    def init_data(self):
+        self.blocksize = 3
+        self.forward = 1
+        self.ori_shape = (32, 9, 6, 6)
+        self.infered_shape = (32, 81, 2, 2)
+        self.one_d_len = 32 * 81 * 2 * 2
+        # blocksize 3 on a square 6x6 input; *_1d are the flattened buffers.
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+
+
+class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp):
+    def init_data(self):
+        self.blocksize = 3
+        self.forward = 1
+        self.ori_shape = (32, 9, 9, 6)
+        self.infered_shape = (32, 81, 3, 2)
+        self.one_d_len = 32 * 81 * 3 * 2
+        # blocksize 3 on a non-square 9x6 input; *_1d are the flattened buffers.
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 643878dc5c2c2854ad3a1b6429d78519b1670857..0be5be6e97d26c6ec42471d078e8e5995727e594 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -49,11 +49,14 @@ class TestSumOp(OpTest):
 
 
 class TestSelectedRowsSumOp(OpTest):
-    def check_with_place(self, place, inplace):
+    def setUp(self):
         self.height = 10
         self.row_numel = 12
         self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.float32
+        self.init_kernel_type()
 
+    def check_with_place(self, place, inplace):
         self.check_input_and_optput(core.Scope(), place, inplace, True, True,
                                     True)
         self.check_input_and_optput(core.Scope(), place, inplace, False, True,
@@ -64,12 +67,12 @@ class TestSelectedRowsSumOp(OpTest):
                                     False)
 
     def init_kernel_type(self):
-        self.dtype = np.float32
+        pass
 
-    def _get_array(self, row_num, row_numel):
-        array = np.ones((row_num, row_numel)).astype(self.dtype)
-        for i in range(row_num):
-            array[i] *= i
+    def _get_array(self, rows, row_numel):
+        array = np.ones((len(rows), row_numel)).astype(self.dtype)
+        for i in range(len(rows)):
+            array[i] *= rows[i]
         return array
 
     def check_input_and_optput(self,
@@ -105,7 +108,7 @@ class TestSelectedRowsSumOp(OpTest):
             self.assertTrue(
                 np.array_equal(
                     np.array(out.get_tensor()),
-                    self._get_array(len(self.rows), self.row_numel) *
+                    self._get_array(self.rows, self.row_numel) *
                     has_data_w_num))
         else:
             self.assertEqual(len(out.rows()), 0)
@@ -121,7 +124,7 @@ class TestSelectedRowsSumOp(OpTest):
         w_selected_rows = var.get_selected_rows()
         w_selected_rows.set_height(self.height)
         w_selected_rows.set_rows(rows)
-        w_array = self._get_array(len(rows), self.row_numel)
+        w_array = self._get_array(self.rows, self.row_numel)
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
@@ -136,36 +139,91 @@ class TestSelectedRowsSumOp(OpTest):
                 self.check_with_place(place, inplace)
 
 
+class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
+    def setUp(self):
+        # Reuse the parent setUp so self.dtype is set and init_kernel_type()
+        # runs (the fp16 subclass depends on it); only the rows differ here.
+        super(TestLoDTensorAndSelectedRowsOp, self).setUp()
+        self.rows = [0, 1, 2, 2, 4, 5, 6]
+
+    def check_with_place(self, place, inplace):
+        scope = core.Scope()
+        if inplace:
+            self.create_lod_tensor(scope, place, "x1")
+            self.create_selected_rows(scope, place, "x2", True)
+            out = scope.var("x1").get_tensor()
+            out_name = "x1"
+        else:
+            self.create_selected_rows(scope, place, "x1", True)
+            self.create_lod_tensor(scope, place, "x2")
+            out = scope.var("out").get_tensor()
+            out_name = "out"
+
+        # create and run sum operator
+        sum_op = Operator("sum", X=["x1", "x2"], Out=out_name)
+        sum_op.run(scope, place)
+
+        # result[r] = 1 (dense tensor) + number of times r occurs in self.rows
+        result = [1] * self.height
+        for ele in self.rows:
+            result[ele] += 1
+
+        out_t = np.array(out)
+        self.assertEqual(out_t.shape[0], self.height)
+        # each source row r holds the value r, so out[r] == r * result[r]
+        dense = self._get_array(list(range(self.height)), self.row_numel)
+        expected = dense * np.array(result).reshape(self.height, 1)
+        self.assertTrue(np.array_equal(out_t, expected))
+
+    def create_lod_tensor(self, scope, place, var_name):
+        """Create a dense tensor var in scope whose row r holds the value r."""
+        var = scope.var(var_name)
+        w_tensor = var.get_tensor()
+        w_array = self._get_array(list(range(self.height)), self.row_numel)
+        w_tensor.set(w_array, place)
+        return var
+
+
+#----------- test fp16 -----------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestFP16SumOp(TestSumOp):
     def init_kernel_type(self):
         self.dtype = np.float16
 
     def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=2e-2)
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=2e-2)
 
     # FIXME: Because of the precision fp16, max_relative_error
     # should be 0.15 here.
     def test_check_grad(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_grad(['x0'], 'Out', max_relative_error=0.15)
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad(['x0'], 'Out', max_relative_error=0.15)
 
 
-class TestFP16SelectedRowsSumOp(TestSelectedRowsSumOp):
-    def init_kernel_type(self):
-        self.dtype = np.float16
+def create_test_sum_fp16_class(parent):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestSumFp16Case(parent):
+        def init_kernel_type(self):
+            self.dtype = np.float16
 
-    def test_w_is_selected_rows(self):
-        if core.is_compiled_with_cuda():
+        def test_w_is_selected_rows(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
                 for inplace in [True, False]:
                     self.check_with_place(place, inplace)
 
+    cls_name = "{0}_{1}".format(parent.__name__, "SumFp16Test")
+    TestSumFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestSumFp16Case
+
+
+create_test_sum_fp16_class(TestSelectedRowsSumOp)
+create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..78b95de7e07b1d1fcdeeae63498e740c2b474c6d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.executor import Executor
+
+
+class TestLoDTensorArrayConcat(unittest.TestCase):
+    """Checks tensor_array_to_tensor (axis=0) forward output and input grads."""
+    def setUp(self):
+        self.op_type = "tensor_array_to_tensor"
+        self.attrs = {"axis": 0}
+        self.outputs = ["Out"]
+
+    def test_get_set(self):
+        scope = core.Scope()
+        program = fluid.Program()
+        block = program.global_block()
+
+        input_arr = block.create_var(
+            name="tmp_lod_tensor_array",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        input_arr.persistable = True
+        input_arr_var = scope.var('tmp_lod_tensor_array')
+        input_tensor_array = input_arr_var.get_lod_tensor_array()
+        self.assertEqual(0, len(input_tensor_array))
+
+        cpu = core.CPUPlace()
+        for i in range(10):
+            t = core.LoDTensor()
+            if i == 0:
+                t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
+            else:
+                t.set(numpy.array([[i]], dtype='float32'), cpu)
+            input_tensor_array.append(t)
+
+        self.assertEqual(10, len(input_tensor_array))
+
+        random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
+
+        y_out = block.create_var(name="Out")
+        y_out.persistable = True
+        y_out_index = block.create_var(name="OutIndex")
+        y_out_index.persistable = True
+
+        y_grad_arr = block.create_var(
+            name='Out@GRAD', dtype='float32', shape=[11])
+        y_grad_arr.persistable = True
+        y_grad = scope.var('Out@GRAD')
+        y_grad_tensor = y_grad.get_tensor()
+        y_grad_tensor.set(random_grad, cpu)
+
+        op = block.append_op(
+            type=self.op_type,
+            inputs={"X": input_arr},
+            outputs={"Out": y_out,
+                     "OutIndex": y_out_index},
+            attrs=self.attrs)
+
+        out_grad = block.create_var(
+            name="tmp_lod_tensor_array@GRAD",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        out_grad.persistable = True
+
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
+                                                                  set(), [])
+        grad_op_desc = grad_op_desc_list[0]
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(grad_op_desc)
+        for var_name in grad_op_desc.output_arg_names():
+            block.desc.var(var_name.encode("ascii"))
+
+        grad_op_desc.infer_var_type(block.desc)
+        grad_op_desc.infer_shape(block.desc)
+        for arg in grad_op_desc.output_arg_names():
+            grad_var = block.desc.find_var(arg.encode("ascii"))
+            grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+        fetch_list = [block.var('Out'), block.var('OutIndex')]
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        out = exe.run(program, fetch_list=fetch_list, scope=scope)
+        # out[0]: concatenated Out, out[1]: OutIndex (rows taken per input)
+
+        # test forward
+        tensor_res = numpy.array(out[0])
+        tensor_res_out_idx = numpy.array(out[1])
+        tensor_gt = numpy.array(
+            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
+
+        self.assertEqual(len(tensor_res), len(tensor_gt))
+        self.assertEqual(len(tensor_res_out_idx), 10)
+
+        for i in range(len(tensor_res)):
+            self.assertEqual(tensor_res[i], tensor_gt[i])
+
+        for i in range(len(tensor_res_out_idx)):
+            if i == 0:
+                self.assertEqual(tensor_res_out_idx[i], 2)
+            else:
+                self.assertEqual(tensor_res_out_idx[i], 1)
+
+        # test backward
+        # Out@GRAD is split back per input: entry 0 gets 2 values, the rest 1
+        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
+        grad_tensor_array = grad_tensor.get_lod_tensor_array()
+
+        self.assertEqual(10, len(grad_tensor_array))
+
+        for i in range(len(grad_tensor_array)):
+            if i == 0:
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[0],
+                    numpy.array(random_grad[i]))
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[1],
+                    numpy.array(random_grad[i + 1]))
+            else:
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i]),
+                    numpy.array(random_grad[i + 1]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 7c7fba76718e911907bb9bef69b3e8688bbf52fc..89bc24802751340b6d4657be8673d714f3d3dc2b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -31,18 +31,17 @@ Steps to transpile pserver:
 """
 
 import math
-import sys
 import numpy as np
 import collections
-import six
 import logging
 
-from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
     Parameter, grad_var_name
 from .details import *
+from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"
@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
         self.optimize_ops, self.params_grads = self._get_optimize_pass()
 
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
-        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+        self.table_name = find_distributed_lookup_table(self.origin_program)
+        self.has_distributed_lookup_table = self.table_name != None
         self.param_name_to_grad_name = dict()
         self.grad_name_to_param_name = dict()
         for param_var, grad_var in self.params_grads:
@@ -966,28 +966,6 @@ to transpile() call.")
 
     # ====================== private transpiler functions =====================
 
-    def _has_distributed_lookup_table(self):
-        # process lookup_table_op
-        # 1. check all lookup_table_op is distributed
-        # 2. check all lookup_table_op share the same table.
-        distributed_lookup_table_ops = []
-        # support only one distributed_lookup_table now
-        self.table_name = None
-        for op in self.origin_program.global_block().ops:
-            if op.type == LOOKUP_TABLE_TYPE:
-                if op.attr('is_distributed') is True:
-                    if self.table_name is None:
-                        self.table_name = op.input("W")[0]
-                    if self.table_name != op.input("W")[0]:
-                        raise RuntimeError("all distributed lookup_table_ops"
-                                           " should have only one table")
-                    distributed_lookup_table_ops.append(op)
-                else:
-                    if self.table_name is not None:
-                        assert op.input("W")[0] != self.table_name
-
-        return len(distributed_lookup_table_ops) > 0
-
     def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                        params_grads):
         # TODO(wuyi): put find a way to put dist lookup table stuff all together.
@@ -1341,7 +1319,6 @@ to transpile() call.")
         """
         create a new block to handle save checkpoint.
         """
-        import os
 
         pserver_program.global_block().create_var(
             name="kLookupTablePath",
@@ -1706,13 +1683,27 @@ to transpile() call.")
             outputs=outputs,
             attrs=opt_op.all_attrs())
 
-    def _is_splited_grad_var(self, var, var_dict):
+    def _get_pserver_grad_param_var(self, var, var_dict):
+        """
+        Return the pserver side grad/param variable from var_dict that
+        matches var, or None if var is not a grad/param, e.g.
+
+            a@GRAD -> a@GRAD.block0
+            a@GRAD -> a@GRAD (a is not split)
+            fc_0.w_0 -> fc_0.w_0.block0
+            fc_0.w_0 -> fc_0.w_0 (weight is not split)
+            _generated_var_123 -> None
+        """
         grad_block = None
         for _, g in six.iteritems(var_dict):
             if self._orig_varname(g.name) == self._orig_varname(var.name):
+                # skip per trainer vars
                 if g.name.find(".trainer_") == -1:
-                    grad_block = g
-                    break
+                    # only params or grads have split blocks
+                    if self._orig_varname(g.name) in self.grad_name_to_param_name or\
+                        self._orig_varname(g.name) in self.param_name_to_grad_name:
+                        grad_block = g
+                        break
         return grad_block
 
     def _clone_lr_op(self, program, block, op):
@@ -1745,32 +1736,38 @@ to transpile() call.")
         for key, varlist in six.iteritems(inputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
-            for var in varlist:
-                # for ops like clipping and weight decay, get the splited var
+            for i in range(len(varlist)):
+                var = varlist[i]
+                # for ops like clipping and weight decay, get the split var (xxx.block0)
                 # for inputs/outputs
-                grad_block = self._is_splited_grad_var(
+                grad_block = self._get_pserver_grad_param_var(
                     var, program.global_block().vars)
                 if grad_block:
-                    inputs[key] = grad_block
+                    varlist[i] = grad_block
                 elif var.name not in program.global_block().vars:
-                    program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
+                    tmpvar = program.global_block()._clone_variable(var)
+                    varlist[i] = tmpvar
+                else:
+                    varlist[i] = program.global_block().vars[var.name]
+            inputs[key] = varlist
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
         for key, varlist in six.iteritems(outputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
-            for var in varlist:
-                grad_block = self._is_splited_grad_var(
+            for i in range(len(varlist)):
+                var = varlist[i]
+                grad_block = self._get_pserver_grad_param_var(
                     var, program.global_block().vars)
                 if grad_block:
-                    outputs[key] = grad_block
+                    varlist[i] = grad_block
                 elif var.name not in program.global_block().vars:
-                    program.global_block()._clone_variable(var)
+                    tmpvar = program.global_block()._clone_variable(var)
+                    varlist[i] = tmpvar
+                else:
+                    varlist[i] = program.global_block().vars[var.name]
+            outputs[key] = varlist
 
         return optimize_block.append_op(
             type=opt_op.type,
diff --git a/python/setup.py.in b/python/setup.py.in
index b1ff9f3a5c3d877edb6bc6a12efce053a44b4c9c..c623057d5081a6fedcd90eb5f5d53531a5d62bb8 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -174,6 +174,18 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             raise Exception("patch libmkldnn.so failed, command: %s" % command)
         package_data['paddle.libs']+=['libmkldnn.so.0']
         shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+if '${WITH_NGRAPH}' == 'ON':
+    # only change rpath in Release mode.
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
+        if os.system(command) != 0:
+            raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
+    for ngraph_lib in ('${NGRAPH_SHARED_LIB}', '${NGRAPH_CPU_LIB}',
+                       '${NGRAPH_TBB_LIB}'):
+        shutil.copy(ngraph_lib, libs_path)
+    package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}',
+                                  '${NGRAPH_CPU_LIB_NAME}',
+                                  '${NGRAPH_TBB_LIB_NAME}']
 # remove unused paddle/libs/__init__.py
 os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path