diff --git a/AUTHORS.md b/AUTHORS.md index 41b7193677a0208ba2fa82b72862292572dcb6ef..4060f75613ac4dadf353ff53a73fd0647a8052be 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -43,6 +43,7 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | Superjom | Chun-Wei Yan | +| tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | diff --git a/CMakeLists.txt b/CMakeLists.txt index ed704585d8a6bf3befd9a549aa5a62a33fea3da9..996a79fbbc3005680205e9fc0442b6bc6199bebb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) if(NOT CMAKE_CROSSCOMPILING) @@ -41,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -65,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." 
OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) +option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization." OFF) @@ -103,6 +111,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_NGRAPH OFF CACHE STRING + "Disable nGraph when cross-compiling for Android and iOS" FORCE) set(WITH_GOLANG OFF CACHE STRING "Disable golang when cross-compiling for Android and iOS" FORCE) @@ -171,6 +181,7 @@ include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn +include(external/ngraph) # download, build, install nGraph include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index f507bb41a1103c093e9569176ee868cfaac6bf7b..964d5fd45b350db2e5948574f53a427e53484ff4 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) # TODO(panyx0718): CUPTI only allows DSO? 
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) endif(NOT WITH_DSO) # setting nvcc arch flags @@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() - message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") + message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index cd51533926de7bb132ab7bfab1686d664a331410..09bec347dbd569203103eccc7dbc0521c291bc0a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,12 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +if(WIN32) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +else(WIN32) + set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +endif(WIN32) + find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 84354c446e2f54fa13b90fa37221eed90968b251..06fc6061bc98eec8c4c71860333f7d3456952aeb 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -58,19 +58,21 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} + -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN} + 
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - +add_dependencies(extern_anakin protobuf mklml) add_library(anakin_shared SHARED IMPORTED GLOBAL) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) -add_dependencies(anakin_shared extern_anakin protobuf mklml) +add_dependencies(anakin_shared extern_anakin) add_library(anakin_saber SHARED IMPORTED GLOBAL) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) -add_dependencies(anakin_saber extern_anakin protobuf mklml) +add_dependencies(anakin_saber extern_anakin) list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index ada61de8eb15ae10288ac54f588e9adf84acee37..5a78a1d1b7dea0d95ae3fa2c9f39679899dd1bcb 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) endif() -IF (WIN32) - MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) -else() - MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") -ENDIF(WIN32) + +MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) +set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) -if (NOT WIN32) ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz - && tar zxf ${BOOST_TAR}.tar.gz + URL ${BOOST_URL} DOWNLOAD_NO_PROGRESS 1 PREFIX ${BOOST_SOURCES_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" UPDATE_COMMAND "" -) -endif(NOT WIN32) + ) if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index cf58cc39762351f8b37d073bcd218d249285bf52..4e98e4bf889bc13938931be7f6cb204c83250a5c 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -35,7 +35,12 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF @@ -48,8 +53,8 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) + COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) ENDIF() ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff 
--git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 25ef2970ac52f12f961c9c6d3a589fec4c80983f..8cd0455c16bf84909b735102e7fb1089744c4245 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -46,7 +46,11 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON @@ -63,7 +67,7 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib ) ENDIF() ENDIF(WIN32) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9fea9ca05bce921b105a0f092c321b0c3a55c63c..785148d4f9f44032e2ce5bf93f0dc80fc865808b 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. 
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2e335579f32df4f146c8d88e05e684a9a8105e20 --- /dev/null +++ b/cmake/external/ngraph.cmake @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(ngraph INTERFACE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with nGraph in Paddle yet." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE) +ENDIF() + +IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN}) + MESSAGE(WARNING + "nGraph needs mkl-dnn to be enabled." 
+ "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE) +ENDIF() + +IF(NOT ${WITH_NGRAPH}) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(NGRAPH_PROJECT "extern_ngraph") +SET(NGRAPH_VERSION "0.9") +SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) +SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) +SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) +SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") + +ExternalProject_Add( + ${NGRAPH_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib +) + +if(UNIX AND NOT APPLE) + include(GNUInstallDirs) + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) +else() + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) +endif() +MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") + +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) + +# Workaround for nGraph expecting mklml to be in mkldnn install directory. 
+ExternalProject_Add_Step( + ${NGRAPH_PROJECT} + PrepareMKL + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so + DEPENDEES download + DEPENDERS configure +) + +add_dependencies(ngraph ${NGRAPH_PROJECT}) +target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) +target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) +target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) +LIST(APPEND external_project_dependencies ngraph) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 755dbd610c40c2d9b85d3017b6f000a869b0f39a..aeb976b840e999a20e8cab11939cbb1f49a27850 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) -# IF(WIN32 AND NOT ${CBLAS_FOUND}) - - IF(NOT ${CBLAS_FOUND}) - INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) @@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." 
FORCE) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + IF (WIN32) SET(CBLAS_FOUND true) MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 550b0dada8e90c1e2b33705fd53c065672113b45..e1e619e572b05e83fbe751af2e5391aafc494416 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() -if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. - function(protobuf_generate_python SRCS) - # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - # Create an include path for each file specified - foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(ABS_PATH ${ABS_FIL} PATH) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() - - if(DEFINED Protobuf_IMPORT_DIRS) - foreach(DIR ${Protobuf_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND 
_protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - endif() - - set(${SRCS}) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(FIL_WE ${FIL} NAME_WE) - if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) - get_filename_component(FIL_DIR ${FIL} DIRECTORY) - if(FIL_DIR) - set(FIL_WE "${FIL_DIR}/${FIL_WE}") - endif() + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" - COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} - COMMENT "Running Python protocol buffer compiler on ${FIL}" - VERBATIM ) + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() endforeach() + endif() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - endfunction() -endif() + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} 
DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() # Print and set the protobuf library information, # finish this cmake process and exit from this file. @@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -144,7 +140,6 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") IF (WIN32) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) - MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT}) ENDIF(WIN32) if (NOT "${PROTOBUF_ROOT}" STREQUAL "") @@ -192,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-Dprotobuf_WITH_ZLIB=ON" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + IF(WIN32) + SET(OPTIONAL_ARGS 
${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") + ENDIF() SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0..a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -21,6 +21,48 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION}) +if(WIN32) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" +"from distutils import sysconfig as s;import sys;import struct; +print(sys.prefix); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + + if(NOT _PYTHON_SUCCESS MATCHES 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() + + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. 
+ if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +endif(WIN32) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c227e09719bd5f0e825f81fb96f78105aa10c79b..4c2d64f627401071098e72bfb930fb5d62fa042d 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -14,23 +14,52 @@ ELSE() ENDIF(APPLE) ENDIF() -ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" -) +if(WIN32) + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR_PLATFORM=x64 + -DBUILD_SHARED_LIBS=OFF + ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND 
${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" + ) +endif() -set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) add_library(xxhash STATIC IMPORTED GLOBAL) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 62227c67849dbb476339a176e0c98e295cbf529c..e21f89c7c585053631391852522d47cd7ffa7638 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) if("${cc_library_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_library_DEPS python) add_dependencies(${TARGET_NAME} python) - target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif(WIN32) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) @@ -288,6 +292,45 @@ function(cc_library TARGET_NAME) endif(cc_library_SRCS) endfunction(cc_library) +# The link operation under windows may exceeds the maximum characters limit, simply break the link command +# into multiple link opeartion can fix that, say +# original: +# lib /out:target.lib a.lib b.lib c.lib d.lib +# after: +# 1. lib /out:dummy_lib_1.lib a.lib b.lib +# 2. lib /out:dummy_lib_2.lib c.lib d.lib +# 1. 
lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib +function(sep_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 50) + list(LENGTH sep_library_DEPS sep_all_len) + foreach(v ${sep_library_DEPS}) + list(APPEND dummy_list ${v}) + list(LENGTH dummy_list listlen ) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len})) + message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}") + cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list}) + foreach(i ${dummy_list}) + list(REMOVE_AT dummy_list 0) + endforeach() + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + if(${sep_library_SHARED}) + cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + else(${sep_library_SHARED}) + cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + endif(${sep_library_SHARED}) +endfunction(sep_library) + function(cc_binary TARGET_NAME) set(options "") set(oneValueArgs "") diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7..729bdcb3dc5324df0a5272402ef203012be0072a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -22,175 +22,196 @@ function(copy TARGET) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) - if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) message(FATAL_ERROR "${TARGET} source numbers are not equal to 
destination numbers") - endif() + endif () math(EXPR len "${copy_lib_SRCS_len} - 1") add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) - foreach(index RANGE ${len}) + foreach (index RANGE ${len}) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND mkdir -p "${dst}" - COMMAND cp -r "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - endforeach() + if (WIN32) + # windows cmd shell will not expand wildcard automatically. + # below expand the files,libs and copy them by rules. + file(GLOB header_files ${src} "*.h") + file(GLOB static_lib_files ${src} "*.lib") + file(GLOB dll_lib_files ${src} "*.dll") + set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) + + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + else (WIN32) # not windows + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND mkdir -p "${dst}" + COMMAND cp -r "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + endif (WIN32) # not windows + endforeach () endfunction() # third party set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") copy(eigen3_lib - SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen - DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported - DEPS eigen3 -) + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported + DEPS eigen3 + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") copy(gflags_lib - SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} - DSTS 
${dst_dir} ${dst_dir}/lib - DEPS gflags -) + SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS gflags + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") copy(glog_lib - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS glog -) + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS glog + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") copy(boost_lib - SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir} - DEPS boost -) + SRCS ${BOOST_INCLUDE_DIR}/boost + DSTS ${dst_dir} + DEPS boost + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") copy(xxhash_lib - SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS xxhash -) + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS xxhash + ) -if(NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS extern_protobuf - ) -endif() + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS extern_protobuf + ) +endif () -if(NOT CBLAS_FOUND) +if (NOT CBLAS_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") copy(openblas_lib - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) elseif (WITH_MKLML) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} - DEPS mklml - ) -endif() - -if(WITH_MKLDNN) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") - copy(mkldnn_lib - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} - 
DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn - ) -endif() + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} + DEPS mklml + ) +endif () + +if (WITH_MKLDNN) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") + copy(mkldnn_lib + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS mkldnn + ) +endif () if (NOT WIN32) -if(NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) - - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) - - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) -endif() -endif(NOT WIN32) + if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) + + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) + + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) + endif () +endif (NOT WIN32) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") if (NOT WIN32) -set(framework_lib_deps framework_py_proto) -endif(NOT WIN32) + set(framework_lib_deps framework_py_proto) +endif (NOT WIN32) copy(framework_lib DEPS 
${framework_lib_deps} - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - ${src_dir}/${module}/ir/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir + ) set(module "memory") copy(memory_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail + ) set(inference_deps paddle_fluid_shared paddle_fluid) set(module "inference/api") if (WITH_ANAKIN AND WITH_MKL) copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api - SRCS - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api - ${ANAKIN_INSTALL_DIR} # anakin release - DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) - list(APPEND inference_deps anakin_inference_lib) -endif() + SRCS + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api + ${ANAKIN_INSTALL_DIR} # anakin release + DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) + list(APPEND inference_deps anakin_inference_lib) +endif () set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - ${src_dir}/${module}/api/paddle_inference_api.h + ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} 
${dst_dir}/${module} ${dst_dir}/${module} -) + ) set(module "platform") copy(platform_lib DEPS profiler_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details + ) set(module "string") copy(string_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat + ) set(module "pybind") copy(pybind_lib - SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h - DSTS ${dst_dir}/${module} -) + SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h + DSTS ${dst_dir}/${module} + ) # CMakeCache Info copy(cmake_cache - SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${FLUID_INSTALL_DIR}) + SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${FLUID_INSTALL_DIR}) # This command generates a complete fluid library for both train and inference add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) @@ -198,14 +219,14 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) # Following commands generate a inference-only fluid library # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} copy(third_party DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt - DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} -) + SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt + DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} + ) -# only need libpaddle_fluid.so/a and paddle_inference_api.h for 
inference-only library +# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* - ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h + ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) @@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) # paddle fluid version function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file(WRITE ${version_file} - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" - "WITH_MKL: ${WITH_MKL}\n" - "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n") - if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") - endif() + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file(WRITE ${version_file} + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: ${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n") + if (WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") + endif () endfunction() version(${FLUID_INSTALL_DIR}/version.txt) version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index fa0e834a1dfd6e60f0ec07945be9a4d84017316f..3dc7171551bfb7aff8d1e75083c98b00378d247f 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -34,4 +34,5 @@ if(TENSORRT_FOUND) "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. 
") include_directories(${TENSORRT_INCLUDE_DIR}) list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY}) + add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md index c97564d93a7f0a753a23cd97d2467d595bd154ff..72723396444c0a6cc0516f6f2379b2d868ba59f7 120000 --- a/doc/v2/dev/contribute_to_paddle_en.md +++ b/doc/v2/dev/contribute_to_paddle_en.md @@ -1 +1 @@ -../../../CONTRIBUTING.md \ No newline at end of file +../../../CONTRIBUTING.md diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index a155c7052be4f07b568af7954f270ea13bd70164..3378d210cdf6a625e11b1dd5fe348aa04cdb9361 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) 
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) @@ -184,6 +184,7 @@ paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'] paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -273,6 +274,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, 
False)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 7d48f0057140cf021a21ea7e304b7e38cc8b9ec2..abadda3adb00e1f41e90e07aa5e961134e69ae3d 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,11 +4,12 @@ add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) -if (NOT WIN32) add_subdirectory(pybind) +if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) # NOTE: please add subdirectory inference at last. 
add_subdirectory(inference) + add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 844291140602a7a0aac9d9d40256deaf9d8a4c60..cb9057672cc2c29af21b662edc189004bb0a4866 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -136,20 +136,32 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) +if(NOT WIN32) +cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler) +endif(NOT WIN32) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -if (NOT WIN32) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ - COMMENT "Copy generated python proto into directory paddle/fluid/proto." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +if (NOT WIN32) + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) + string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) @@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator) + else(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() - + if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 48f94a1f05614d4b797562ac67cdb9828fd0456e..37202f869508c283e1b464942cadc0ebe3eef39c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { 
BuildStrategy strategy_; }; -std::shared_ptr BuildStrategy::CreatePassesFromStrategy() - const { +std::shared_ptr BuildStrategy::CreatePassesFromStrategy( + bool finalize_strategy) const { + if (is_finalized_) { + return pass_builder_; + } pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + if (finalize_strategy) { + is_finalized_ = true; + } return pass_builder_; } @@ -95,10 +101,8 @@ std::unique_ptr BuildStrategy::Apply( #else const bool use_cuda) const { #endif - // Create a default one if not initialized by user. - if (!pass_builder_) { - CreatePassesFromStrategy(); - } + // Create a default one if not finalized by user. + CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 6c7b54db8f610aa34cd51dcbc13063290cae3ac0..fc2641dbd48274b43db0b1f156e3e1128f96772e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -75,12 +75,20 @@ struct BuildStrategy { bool remove_unnecessary_lock_{false}; + // NOTE: + // Before you add new options, think if it's a general strategy that works + // with other strategy. If not, the strategy should be created through + // CreatePassesFromStrategy and the pass can be managed separately. + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. // A new PassBuilder is created based on configs defined above and // passes are owned by the PassBuilder. - std::shared_ptr CreatePassesFromStrategy() const; + std::shared_ptr CreatePassesFromStrategy( + bool finalize_strategy) const; + + bool IsFinalized() const { return is_finalized_; } // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. 
@@ -97,6 +105,7 @@ struct BuildStrategy { #endif private: + mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; }; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index ca11c0083961f9b3d04e33113a0d685d508918f9..949510e03705a4a0900f1c7b8758a8f7308aa44b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -30,8 +30,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), graph_(std::move(graph)), - pool_(strategy.num_threads_ + - 1), // add one more thread for generate op_deps + pool_(strategy.num_threads_), + prepare_pool_(1), // add one more thread for generate op_deps fetch_ctxs_(places) { for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); @@ -160,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( }); } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = pool_.enqueue([&] { + atomic_op_deps_ = prepare_pool_.enqueue([&] { auto *op_deps = new std::unordered_map>; for (auto &pair : op_deps_) { (*op_deps)[pair.first] = pair.second; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 8b8382447105c8caa36963214684d6ee9fa15200..949616f02d5168e6abab932d608e4b20ee64304a 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector bootstrap_ops_; ::ThreadPool pool_; + ::ThreadPool prepare_pool_; platform::DeviceContextPool fetch_ctxs_; std::atomic remaining_; diff --git a/paddle/fluid/framework/executor.cc 
b/paddle/fluid/framework/executor.cc index d97f03660c70ed19ee27fd9f8096a4328b33a892..7ce08b728d9436c3b6e678faf328ddf1c45b7080 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/detail/macros.h" @@ -25,6 +26,7 @@ limitations under the License. */ DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); namespace paddle { namespace framework { @@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } } +static void EnableFusedOp(ExecutorPrepareContext* ctx) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(3) << "use_ngraph=True"; + auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_); + for (auto& interval : intervals) { + auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_, + interval.at(0), interval.at(1)); + *interval[0] = std::unique_ptr(fused_op); + } + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + ctx->ops_.erase(it->at(0) + 1, it->at(1)); + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -338,6 +358,7 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } + if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } @@ -359,6 +380,7 @@ std::vector> Executor::Prepare( void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, 
bool create_vars, bool keep_kids) { + PADDLE_ENFORCE_NOT_NULL(scope); Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -473,6 +495,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; #endif } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b403252c972d26da6deeca54ce88a9547ffe7afa..818b3334ea4171fd7a9cbaa896ee1672e8ecca51 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -29,7 +29,7 @@ template class GarbageCollector { public: GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_(std::max(max_memory_size, static_cast(1))) { + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { garbages_.reset(new std::deque()); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4cf973253cc4f1f22d2fc578a1ac3a8c95e479c9..504f7e6d6c13d6c40d72a53e52fec920457f2dae 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") # Usage: pass_library(target inference) will append to paddle_inference_pass.h +unset(INFER_IR_PASSES CACHE) # clear the global variable function(pass_library TARGET DEST) set(options "") set(oneValueArgs "") @@ -15,10 +16,11 @@ function(pass_library TARGET DEST) if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") message(STATUS "add pass ${TARGET} ${DEST}") file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") - set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) + set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "") endif() endfunction() + cc_library(node SRCS node.cc 
DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 6b284b1c1a4a37803229f4d55b100ca1da3a741d..c436dd414d01ab61d143427fe7ecd34a82f11f8d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) { #undef OP_SET_IN #undef OP_SET_OUT - auto* X = graph->RetriveNode(34); - auto* LSTMOUT = graph->RetriveNode(81); - auto* cell_init = graph->RetriveNode(6); - auto* hidden_init = graph->RetriveNode(8); + auto* X = graph->RetrieveNode(34); + auto* LSTMOUT = graph->RetrieveNode(81); + auto* cell_init = graph->RetrieveNode(6); + auto* hidden_init = graph->RetrieveNode(8); auto* lstm_op = graph->CreateOpNode(&op_desc); PrepareParameters(graph, param); @@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, VLOG(30) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); - std::array tensors( - {{W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}); - std::array tensors1( - {{W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}); + std::array tensors{ + W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; + std::array tensors1{ + W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { - std::array tensors( - {{B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}); + std::array tensors{ + 
B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 3348abb19b3339b2b3e8b50485133b15a1973a32..7b6ce0da07309a0ed2a5c8bcd5f59d84105261d7 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -57,6 +57,7 @@ std::unique_ptr FCFusePass::ApplyImpl( desc.SetInput("W", std::vector({fc_Y_in})); desc.SetInput("Bias", std::vector({fc_bias_in})); desc.SetOutput("Out", std::vector({fc_out_out})); + desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 2db7d95cae1c8c59691fd642e2462e92ed58814f..4e1e4e27f9ba932b56ecc25e816a2aee9d42362e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, if (type == "mul") { op->SetInput("X", {inputs[0]}); op->SetInput("Y", {inputs[1]}); + op->SetAttr("x_num_col_dims", {1}); } else if (type == "elementwise_add") { op->SetInput("X", inputs); } diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index a2a8baa5e45d1791120e32c62dd0dbc533668290..ae0e42ff5e89466013382ab97650e6afeeff3d2d 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -84,8 +84,6 @@ void CheckProgram(const ProgramDesc &program) { Graph::Graph(const ProgramDesc &program) : program_(program) { CheckProgram(program_); - // Make the nodes id start from 0. 
- Node::ResetId(); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 6384d89d2f2af4ab1d733af5eb1561cab2d09728..0c856f8e610077c69416ccfb8a763d4b8ae881b8 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -116,13 +116,17 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { PADDLE_ENFORCE(var_desc); - return AddNode(new ir::Node(var_desc)); + auto *x = AddNode(new ir::Node(var_desc)); + x->SetId(num_node_created_++); + return x; } // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { PADDLE_ENFORCE(op_desc); - return AddNode(new ir::Node(op_desc)); + auto *x = AddNode(new ir::Node(op_desc)); + x->SetId(num_node_created_++); + return x; } // Create a control dependency var that connects 2 operations. The @@ -132,13 +136,17 @@ class Graph { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); - return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + x->SetId(num_node_created_++); + return x; } // A more free style way of creating a graph node. Mostly use for test // or "copy" from another node. Avoid using it if possible. ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { - return AddNode(new ir::Node(name, type)); + auto *x = AddNode(new ir::Node(name, type)); + x->SetId(num_node_created_++); + return x; } // Clear all node information of the graph and return the ownership of the @@ -160,7 +168,7 @@ class Graph { } // NOTE low performance, but simple and secure. 
- Node *RetriveNode(int id) { + Node *RetrieveNode(int id) { for (auto &node : nodes_) { if (node.second->id() == id) { return node.second.get(); @@ -169,6 +177,7 @@ class Graph { return nullptr; } + const ProgramDesc &program() const { return program_; } std::map> InitFromProgram( const ProgramDesc &program); @@ -190,6 +199,7 @@ class Graph { std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; + size_t num_node_created_{0}; // help to generate a unique node id. }; bool IsControlDepVar(const ir::Node &var); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0a3c8a6cb5c4187063db8ec2decd1f8d55aa39ea..b534a5509279ef7bfc5fc92ec726224e6c5ed16f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -167,10 +167,12 @@ struct HitGroup { bool Match(Node *node, PDNode *pat) { if (nodes_.count(node)) { - if (!roles.count(pat)) return false; - return roles[pat] == node; + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; } - return !roles.count(pat) || roles.at(pat) == node; } void Register(Node *node, PDNode *pat) { @@ -198,7 +200,6 @@ GraphPatternDetector::DetectPatterns() { std::vector result; std::vector init_groups; std::array, 2> bi_records; - // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() : pattern_.edges().front().first; if (!pdnodes2nodes_.count(first_pnode)) return result; @@ -228,11 +229,12 @@ GraphPatternDetector::DetectPatterns() { VLOG(80) << "check " << source->id() << " -- " << target->id(); // TODO(Superjomn) add some prune strategies. 
for (const auto &group : pre_groups) { - HitGroup new_group = group; - if (IsNodesLink(source, target) && - new_group.Match(source, edge.first)) { - new_group.Register(source, edge.first); - if (new_group.Match(target, edge.second)) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); new_group.Register(target, edge.second); cur_groups.push_back(new_group); // TODO(Superjomn) need to unique @@ -261,14 +263,16 @@ GraphPatternDetector::DetectPatterns() { return result; } -bool GraphItemCMP(const std::pair &a, +struct GraphItemLessThan { + bool operator()(const std::pair &a, const std::pair &b) { - if (a.first != b.first) { - return a.first < b.first; - } else { - return a.second < b.second; + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } } -} +}; // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 @@ -282,7 +286,7 @@ void GraphPatternDetector::UniquePatterns( for (auto &g : *subgraphs) { // Sort the items in the sub-graph, and transform to a string key. 
std::vector> sorted_keys(g.begin(), g.end()); - std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); std::stringstream ss; for (auto &item : sorted_keys) { ss << item.first << ":" << item.second; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 9e462ac671ee931fc17a31f32a76049a0990341f..1c5155df7867f95fb403d51bf633084a6c400f12 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph, const std::unordered_set& nodes); // Some pre-defined patterns those can be reused in multiple passes. -// The related Fluid Layer or Op should be one pattern here for better reusage -// accross different fusion. +// The related Fluid Layer or Op should be one pattern here for better re-usage +// across different fusion. namespace patterns { struct KeyCounter { diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 414d8f79b15de091c62af5fe099ffae144156e4e..36f36933265c69fcd450894a3e32bbb3e547b62c 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -35,10 +35,11 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( new proto::ProgramDesc(*program.Proto())); auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->set_idx(kRootBlockIndex); block->clear_vars(); std::unordered_set visited_vars; for (ir::Node* n : graph->Nodes()) { - if (n->NodeType() == ir::Node::Type::kVariable) { + if (n->IsVar()) { if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { visited_vars.insert(n->Var()->Name()); block->add_vars()->MergeFrom(*n->Var()->Proto()); diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 
084a4ba2def87eaa8badb3ca2c39865c6e5cb981..2ee12cc410393d1e1aa5fc9e5374d858eca1b901 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) { } Node *NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +NodesTSIterator::NodesTSIterator(const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + inlink_visited.clear(); + + std::copy_if(p->inputs.begin(), p->inputs.end(), + std::back_inserter(inlink_visited), + [&](Node *x) -> bool { return visited.count(x) != 0; }); + + if (inlink_visited.size() == p->inputs.size()) { + sorted_.push_back(p); + for (auto *_ : p->outputs) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +NodesTSIterator &NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool NodesTSIterator::operator==(const NodesTSIterator &other) { + return sorted_ == 
other.sorted_ && cursor_ == other.cursor_; +} + +Node *NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h index f42bab20ed97e372d2da0c4a492a4458ab94e0a0..f6772f9a37567c83c49bd44d551481edda1a74ae 100644 --- a/paddle/fluid/framework/ir/graph_traits.h +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -62,6 +62,32 @@ struct NodesDFSIterator std::unordered_set visited_; }; +// Topological sorting iterator on nodes. +struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; +}; + /* * GraphTraits contains some graph traversal algorithms. * @@ -76,6 +102,14 @@ struct GraphTraits { NodesDFSIterator()); } + static iterator_range TS(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); + } + private: // The nodes those have no input will be treated as start points. 
static std::vector ExtractStartPoints(const Graph &g) { diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 9277abe8c1b79c5f76f4610d0554bf337f329518..50d9113088903aa7681d6c6af5cc65f846d32787 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,8 +17,12 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +// msvc15 don't support constexpr in correct way. +#if !defined(_WIN32) constexpr char Node::kControlDepVarName[]; -int Node::count_ = 0; +#else +const char Node::kControlDepVarName[] = "__control_var"; +#endif std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type) { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index eedb375cf46165ebb09af56e5ab052a0327f1d0c..d2a393b3f19e9aab79098757dae663d030b0fa2b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -55,7 +55,11 @@ class Node { } enum class Type { kOperation, kVariable }; +#if !defined(_WIN32) // msvc not support constexpr correctly. static constexpr char kControlDepVarName[] = "__control_var"; +#else + static const char kControlDepVarName[]; +#endif Type NodeType() const { return type_; } @@ -115,37 +119,30 @@ class Node { int id_; private: + // ID can only set by a Graph. 
+ void SetId(int id) { id_ = id; } + friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); explicit Node(const std::string& name, Type type) - : name_(name), - var_desc_(nullptr), - op_desc_(nullptr), - type_(type), - id_(count_++) {} + : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} explicit Node(VarDesc* var_desc) : name_(var_desc->Name()), var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), - type_(Type::kVariable), - id_(count_++) {} + type_(Type::kVariable) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), op_desc_(new OpDesc(*op_desc, op_desc->Block())), - type_(Type::kOperation), - id_(count_++) {} + type_(Type::kOperation) {} Node() = delete; - static int count_; - // Please don't use this API or make this public. - static void ResetId() { count_ = 0; } - boost::any wrapper_; std::function wrapper_deleter_; std::type_index wrapper_type_ = std::type_index(typeid(void)); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 8ac8d7677e1fe339d7802ea262e61a02a678aab5..615b539695de8c3f9a256d17d4d49e61902da394 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -93,6 +93,7 @@ class Pass { protected: virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const { LOG(FATAL) << "Calling virtual Pass not implemented."; + return graph; } private: @@ -196,26 +197,26 @@ struct PassRegistrar : public Registrar { msg) // Register a new pass that can be applied on the IR. 
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ UNUSED = \ __pass_registrar_##pass_type##__ -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 8e660f97f051b194a0305dc82371fbe64da7e061..c384456b648d4497bf4bd003b183b773186e0f15 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -57,60 +57,58 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { 
} } -void NaiveExecutor::Prepare(Scope *parent_scope, - const ProgramDesc &program_desc, int block_id, - bool with_feed_fetch_ops) { - if (!parent_scope) { +void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, + int block_id, bool with_feed_fetch_ops) { + if (!scope) { scope_ = new framework::Scope; } else { - scope_ = &parent_scope->NewScope(); + scope_ = scope; } - CreateVariables(program_desc, scope_, block_id); + + VLOG(3) << "NaiveExecutor init with scope " << scope; CreateOps(program_desc, block_id, with_feed_fetch_ops); } void NaiveExecutor::Run() { for (auto &op : ops_) { - VLOG(40) << "run " << op->Type(); + VLOG(3) << std::this_thread::get_id() << " run " << op->Type() + << " on scope " << scope_; op->Run(*scope_, place_); } } -void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, - int block_id) { - PADDLE_ENFORCE(scope); +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, + bool persistable, Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + auto &global_block = desc.Block(block_id); - const Scope *ancestor_scope = scope; - while (ancestor_scope->parent()) { - ancestor_scope = ancestor_scope->parent(); + const auto *anc = scope; + PADDLE_ENFORCE(anc->parent() != anc); + while (anc->parent()) { + anc = anc->parent(); } - if (ancestor_scope != scope) { - for (auto &var : global_block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - // Create persistable vars in ancestor scope. - if (var->Persistable()) { - auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { // Create temporary variables in local scope. 
- auto *ptr = scope->Var(var->Name()); + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (persistable == var->Persistable()) { + if (persistable) { + if (!anc->FindVar(var->Name())) { + auto *ptr = const_cast(anc)->Var(var->Name()); + VLOG(3) << scope << " Create persistable variable " << var->Name() + << ", which pointer is " << ptr; + InitializeVariable(ptr, var->GetType()); + } + } else { + auto *ptr = const_cast(scope)->Var(var->Name()); + VLOG(3) << scope << " Create variable " << var->Name() + << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; } } - } else { - for (auto &var : global_block.AllVars()) { - auto *ptr = scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } } } diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index ddfa6e1f4d8b73f594fc381ab505797491cdd378..5e673f68574c4ddaa4c9260367d09e9f62f6b751 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -35,8 +35,14 @@ class NaiveExecutor { // Create child scope. // Create variables. // @with_feed_fetch_ops: whether to work with the feed and fetch operators. - void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, - int block_id, bool with_feed_fetch_ops); + void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id, + bool with_feed_fetch_ops); + + // Create variables before head. + // Create parameters if persistable is ture, or create the temporary variables + // instead. + void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable, + Scope* scope); // Run all the operators. 
void Run(); @@ -49,8 +55,6 @@ class NaiveExecutor { void CleanFeedFetchOps(); protected: - void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); - void CreateOps(const ProgramDesc& desc, int block_id, bool with_feed_fetch_ops); diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index 6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5..c917630666b082ab7148550707f9f1f720aa25d3 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) { auto place = platform::CPUPlace(); NaiveExecutor exe(place); - exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + exe.Prepare(nullptr, program, 0, false); auto* a_tensor = exe.FindTensor("a"); auto* b_tensor = exe.FindTensor("b"); auto* c_tensor = exe.FindTensor("c"); diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc new file mode 100644 index 0000000000000000000000000000000000000000..8177436d0bd90c3bcf8f91d5c55b66be188b19f9 --- /dev/null +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#include +#include + +#include "paddle/fluid/framework/ngraph_bridge.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +std::map&, + std::shared_ptr>>)>> + NgraphBridge::NG_NODE_MAP = {}; + +void NgraphBridge::build_graph(const std::shared_ptr& op) { + auto& op_type = op->Type(); + NG_NODE_MAP[op_type](op, ngb_node_map); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..55bf0d21f3471013b1fb780e852d813313345f03 --- /dev/null +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_NGRAPH + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +class NgraphBridge { + public: + static std::map< + std::string, + std::function&, + std::shared_ptr>>)>> + NG_NODE_MAP; + + explicit NgraphBridge( + std::shared_ptr< + std::unordered_map>> + var_node_map) + : ngb_node_map(var_node_map) {} + + void build_graph(const std::shared_ptr& op); + + private: + std::shared_ptr< + std::unordered_map>> + ngb_node_map; +}; + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..d967b2780c21713a2f9a73a3402964103f44269e --- /dev/null +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#include + +#include +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace framework { + +static std::map pd2ng_type_map = { + {proto::VarType::FP32, ngraph::element::f32}, + {proto::VarType::FP64, ngraph::element::f64}, + {proto::VarType::INT32, ngraph::element::i32}, + {proto::VarType::INT64, ngraph::element::i64}, + {proto::VarType::BOOL, ngraph::element::boolean}, +}; + +typedef enum { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST /* Support partial list of ops for test */ +} op_state; + +class NgraphOperator { + public: + explicit NgraphOperator(const Scope& scope, const platform::Place& place, + const std::vector>& ops, + const std::unordered_map< + std::string, ngraph::element::Type>& var_type_map, + const std::unordered_set& persist, + const std::unordered_set& fetches, + const std::unordered_set& post_op_inputs, + op_state ng_op_state) + : scope_(scope), + place_(place), + fused_ops_(ops), + var_type_map_(var_type_map), + persistables_(persist), + fetches_(fetches), + post_op_inputs_(post_op_inputs), + ng_op_state_(ng_op_state) {} + + void Run(const Scope& scope, const platform::Place& place) const; + + private: + static std::unordered_map> + func_cache; + const Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + op_state ng_op_state_; +}; + +std::vector>::iterator>> +FusedOperator::FusedOpIntervals( + std::vector>* ops) { + std::vector>::iterator>> + intervals; + if 
(ops->empty()) { + return intervals; + } + size_t size = ops->size(); + size_t left = 0; + while (left < size && ops.at(left)->Type() != kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops->at(left)->Type() == kFeedOpType) { + ++left; + } + + size_t right = left; + while (right < size && ops->at(right)->Type() != kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + size_t pivot = left; + while (pivot < right) { + auto op_type = ops->at(pivot)->Type(); + if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == + paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + size_t start = pivot, end = start; + while (pivot < right && + (paddle::framework::NgraphBridge::NG_NODE_MAP.find( + ops.at(pivot)->Type()) != + paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector>::iterator> + interval = {ops->begin() + start, ops->begin() + end}; + intervals.push_back(interval); + } + } // end while + + return intervals; +} + +FusedOperator::FusedOperator( + const ProgramDesc& prog, size_t block_id, + std::vector>::iterator start, + std::vector>::iterator end, + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + for (std::vector>::iterator it = start; + it != end; ++it) { + fused_ops_.push_back(std::move(*it)); + } + + for (std::vector>::iterator it = end; + (*it)->Type() != kFetchOpType; ++it) { + for (auto& var_name_item : (*it)->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + } + + if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { + is_complete = true; + } + + Process(); +} + +void 
FusedOperator::Process() { + auto& bdesc = pdesc_.Block(block_); + for (auto& var : bdesc.AllVars()) { + if (!(var->GetType() == proto::VarType::SELECTED_ROWS || + var->GetType() == proto::VarType::LOD_TENSOR || + var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != "fetch" && var_name != "feed") { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + for (auto* op : bdesc.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + fetches_.insert(fetch_target_name); + } + } +} + +void FusedOperator::RunImpl(const Scope& scope, + const platform::Place& place) const { + op_state ng_op_state = PARTIAL_TEST; + auto& bdesc = pdesc_.Block(block_); + for (auto* op : bdesc.AllOps()) { + if (op->Type().find("_grad") != std::string::npos) { + ng_op_state = PARTIAL_TRAIN; + break; + } + } + + if (is_full) { + ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; + } + + NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); + ngraph_op.Run(scope, place); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..0f655cef1dde624bcf4944b5c096279097e1c8ae --- /dev/null +++ b/paddle/fluid/framework/ngraph_operator.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_NGRAPH + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/variant.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +class FusedOperator : public OperatorBase { + public: + static std::vector< + std::vector>::iterator>> + FusedOpIntervals( + std::vector>* ops); + + explicit FusedOperator( + const ProgramDesc& prog, size_t block_id, + std::vector>::iterator start, + std::vector>::iterator end, + const std::string& type = "fused_op", const VariableNameMap& inputs = {}, + const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); + + void RunImpl(const Scope& scope, const platform::Place& place) const final; + + private: + const ProgramDesc pdesc_; + size_t block_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + bool is_full_ = false; + + 
void Process(); +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5624878d439873e5f6aee6ec9234e31d5c77ff97..6bd744edc22e6a90ce64e9d699e7f3c5c60d4908 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -150,14 +150,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. +// The profile has a process-wide mutex, results in serious performance issue +// in concurrency scenerio. Here use an `if` to fix this issue. +// Please not remove the `if`, ask @Superjomn if there are any concern. +#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else { + } else +#endif + { RunImpl(scope, place); } VLOG(30) << place << " " << DebugStringEx(&scope); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0c407f8c1d11a8a0f99551fc51d2ef2be5262c63..bbeef150254f8f7a1f382a5b81055a6a5589eee1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include // for unique_ptr +#include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" @@ -36,6 +38,16 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." 
"Disabled when this value is less than 0"); +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. +#ifdef ON_INFER +#define SCOPE_LOCK_GUARD +#else +#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +#endif + namespace paddle { namespace framework { @@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindScopeInternal(var); } void Scope::DropKids() { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD std::vector known_vars; known_vars.reserve(this->vars_.size()); for 
(auto& p : vars_) { @@ -106,9 +118,10 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); - PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", + this, scope); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. if (FLAGS_benchmark || FLAGS_eager_delete_scope) { @@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const { return nullptr; } +std::string GenScopeTreeDebugInfo(Scope* root) { + std::stringstream os; + + if (!root) return ""; + + // level traversal + std::queue queue; + queue.push(root); + + std::vector scopes; + + while (!queue.empty()) { + auto* end = queue.back(); + Scope* q = nullptr; + while (q != end) { + q = queue.front(); + queue.pop(); + os << q << " "; + scopes.push_back(q); + + for (auto* c : q->kids()) { + queue.push(c); + } + } + // end of a level + os << "\n------------------------------------------\n"; + } + + 
os << "\nDetails:\n\n"; + + for (Scope* q : scopes) { + os << "====\n"; + os << q << ":\n"; + for (auto& var : q->LocalVarNames()) { + os << " - " << var << "\n"; + } + } + + return os.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9462620e829ec815e1553f6378a67463ea3b8aa3..1901ffbe57e0d85193c3a218f06eba06a0f287a5 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -78,11 +78,11 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); - std::list& kids() const { return kids_; } - /// Find if a scope exists in the kid scopes bool HasKid(const Scope* scope) const; + const std::list& kids() const { return kids_; } + // enumerate all the variables current contains. std::vector LocalVarNames() const; @@ -118,12 +118,17 @@ class Scope { // Scope in `kids_` are owned by this class. mutable std::list kids_; - Scope const* parent_{nullptr}; + const Scope* parent_{nullptr}; DISABLE_COPY_AND_ASSIGN(Scope); private: mutable std::mutex mutex_; }; + +// Generate some debug string about the inherience structure of scope, quite +// naive. 
+std::string GenScopeTreeDebugInfo(Scope*); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 3319c772ec789bc5b28307906adfb2a9417d9182..f4f2b769d5e47d8fba8d08476df4cd8e54133551 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -63,6 +63,26 @@ struct TensorCopyVisitor { int64_t size_; }; +struct TensorFillVisitor { + TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, + float value) + : dst_(dst), dst_offset_(dst_offset), size_(size) {} + + template + void apply() const { + // TODO(qiao): support other place + platform::CPUPlace cpu; + auto* tensor_data = dst_->mutable_data(cpu); + auto* start = tensor_data + dst_offset_; + auto* end = start + size_; + std::fill(start, end, static_cast(0.0)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + int64_t size_; +}; + void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version @@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, + bool is_test) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + rwlock_->RDLock(); auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { @@ -172,7 +202,7 @@ void SelectedRows::SyncIndex() { } void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown) { + bool auto_grown, bool is_test) { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); if (ids.numel() == 0) { @@ -183,11 +213,19 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, "output tensor 
should have the same shape with table " "except the dims[0]."); for (int i = 0; i < ids.numel(); ++i) { - int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); - framework::VisitDataType( - framework::ToDataType(value_->type()), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); + auto id = ids.data()[i]; + int64_t index = AutoGrownIndex(id, auto_grown, is_test); + if (index < 0) { + VLOG(5) << "id " << id << " not in the table, return 0"; + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorFillVisitor(value, i * value_width, value_width, 0.0)); + } else { + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); + } } } } diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index daf5e95304fb84eaba26a30c45414d5021e7ffcb..55ca02038e083da4f8984f70fecf4ca2d878088e 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -105,7 +105,7 @@ class SelectedRows { * the value */ void Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown = false); + bool auto_grown = false, bool is_test = false); /* * @brief Get the index of the key from id_to_index_ map. If the key not @@ -118,7 +118,7 @@ class SelectedRows { * * @return index of the key. 
*/ - int64_t AutoGrownIndex(int64_t key, bool auto_grown); + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); void SyncIndex(); diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 9c427a4ae4c9660b107ca891a60db306cb09301f..3b0509e0344efedf08ab21cac0a075049617ca97 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -84,10 +84,14 @@ TEST(SelectedRows, SparseTable) { data[i * embedding_width + j] = static_cast(i); } } - ASSERT_EQ(table.AutoGrownIndex(10, true), 0); - ASSERT_EQ(table.AutoGrownIndex(8, true), 1); - ASSERT_EQ(table.AutoGrownIndex(8, true), 1); - ASSERT_EQ(table.AutoGrownIndex(6, true), 2); + ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2); + for (int64_t i = 11; i < 20; i++) { + ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1); + ASSERT_TRUE(!table.HasKey(i)); + } ASSERT_TRUE(table.HasKey(10)); ASSERT_TRUE(table.HasKey(8)); ASSERT_TRUE(table.HasKey(6)); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index e5678cf607a8ff3763e79c1f321a81c021846fb1..fc656613010d18608c2780c96212f2d7bb674704 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -16,9 +16,21 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) +get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS) +if (WIN32) +list(APPEND fluid_third_partys gflags glog protobuf cblas) +endif(WIN32) # paddle_fluid_origin exclude inference api interface -cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +if(WIN32) + sep_library(paddle_fluid_origin 
DEPS ${fluid_modules} paddle_fluid_api) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) add_subdirectory(api) @@ -27,13 +39,17 @@ set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) -if (WITH_GPU AND TENSORRT_FOUND) - set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine) - set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc) -endif() -# Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder) +endif(WIN32) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
@@ -42,11 +58,20 @@ if(NOT APPLE) endif() # Create shared library -cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) + target_link_libraries(paddle_fluid_shared shlwapi) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) +endif() set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 0354f9e6e9588af601210b8a71ae98c1f90d62f0..eb89fc5e1124e97b082d6299e3efc44591a8b01b 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,24 +1,25 @@ -cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) -set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) +unset(analysis_deps CACHE) +set(analysis_deps # analysis_deps can be extended accross the project + framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log + ir_pass_manager + CACHE INTERNAL "") -cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc +add_subdirectory(ir_passes) 
+add_subdirectory(passes) + +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES}) + +cc_library(argument SRCS argument.cc DEPS scope proto_desc) +cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc) + +cc_library(analysis SRCS analyzer.cc helper.cc - # passes - analysis_pass.cc - fluid_to_data_flow_graph_pass.cc - data_flow_graph_to_fluid_pass.cc - dfg_graphviz_draw_pass.cc - tensorrt_subgraph_pass.cc - tensorrt_subgraph_node_mark_pass.cc - fluid_to_ir_pass.cc - model_store_pass.cc - DEPS ${analysis_deps}) + analysis_pass + DEPS ${analysis_deps} + ) -cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) function(inference_analysis_test TARGET) if(WITH_TESTING) @@ -34,13 +35,3 @@ function(inference_analysis_test TARGET) endfunction(inference_analysis_test) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) -inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) -inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) -inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) -inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) -inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) -inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) -inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) -inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analysis_pass.h 
b/paddle/fluid/inference/analysis/analysis_pass.h index 13805ea4acf936b242bcd86b2faf89813753a9fe..299f235a74ae0ffb663be61079607d8ac1105a97 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -19,42 +19,36 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/node.h" namespace paddle { namespace inference { namespace analysis { +/* + * AnalysisPass is a pass used to control the IR passes. + */ class AnalysisPass { public: AnalysisPass() = default; virtual ~AnalysisPass() = default; - // Mutable Pass. - virtual bool Initialize(Argument *argument) { return false; } - // Readonly Pass. - virtual bool Initialize(const Argument &argument) { return false; } - // Virtual method overriden by subclasses to do any necessary clean up after - // all passes have run. - virtual bool Finalize() { return false; } - - // Create a debugger Pass that draw the DFG by graphviz toolkit. - virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } - - // Run on a single DataFlowGraph. - virtual void Run(DataFlowGraph *x) = 0; + // Run on a single Graph. + void Run(Argument* argument) { RunImpl(argument); } // Human-readable short representation. virtual std::string repr() const = 0; // Human-readable long description. virtual std::string description() const { return "No DOC"; } -}; -// GraphPass processes on any GraphType. -class DataFlowGraphPass : public AnalysisPass {}; + protected: + // User should implement these. 
+ virtual void RunImpl(Argument* argument) = 0; + + Argument* argument_{nullptr}; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d55303a51e9fee3057455470b4b3139dc5f85e89..c8ed373ee7c32552608d501aa642677f940cd520 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -15,138 +15,23 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include - -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" -#include "paddle/fluid/inference/analysis/model_store_pass.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" - -DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false, - "Enable subgraph to TensorRT engine for acceleration"); - -DEFINE_bool(IA_enable_ir, false, "Turn on IR support"); - -DEFINE_string(IA_graphviz_log_root, "./", - "Graphviz debuger for data flow graphs."); - -DEFINE_string(IA_output_storage_path, "", "optimized model output path"); +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -class DfgPassManagerImpl final : public DfgPassManager { - public: - DfgPassManagerImpl() { - // TODO(Superjomn) set the key with pass reprs. 
- if (!FLAGS_IA_enable_ir) { - AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); - } else { - AddPass("fluid-to-ir-pass", new FluidToIrPass); - } - TryAddTensorRtPass(); - AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); - if (!FLAGS_IA_output_storage_path.empty()) { - AddPass("model-store-pass", new ModelStorePass); - } - } +Analyzer::Analyzer() {} - std::string repr() const override { return "dfg-pass-manager"; } - std::string description() const override { return "DFG pass manager."; } +void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); } - private: - void AddPass(const std::string& name, AnalysisPass* pass) { - VLOG(30) << "Adding pass " << name; - Register(name, pass); - AddGraphvizDebugerPass(pass); - } +void Analyzer::RunIrAnalysis(Argument *argument) { + std::vector passes({"ir_analysis_compose_pass"}); - void TryAddTensorRtPass() { - if (FLAGS_IA_enable_tensorrt_subgraph_engine) { - auto trt_teller = [&](const Node* node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout"}); - if (!node->IsFunction()) return false; - - const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) { - return true; - } else { - return false; - } - }; - - AddPass("tensorrt-subgraph-marker", - new TensorRTSubgraphNodeMarkPass(trt_teller)); - AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); - } - } - - // Add the graphviz debuger pass if the parent pass has one. 
- void AddGraphvizDebugerPass(AnalysisPass* pass) { - auto* debuger_pass = pass->CreateGraphvizDebugerPass(); - if (debuger_pass) { - Register(debuger_pass->repr(), debuger_pass); - } + for (auto &pass : passes) { + PassRegistry::Global().Retreive(pass)->Run(argument); } -}; - -Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } - -void Analyzer::Run(Argument* argument) { - std::vector passes; - passes.push_back("graph_viz_pass"); // add graphviz for debug. -#ifdef PADDLE_WITH_MKLDNN - if (use_mkldnn_) { - VLOG(30) << "Adding MKL-DNN placement pass"; - passes.push_back("mkldnn_placement_pass"); - } -#endif - // infer_clean_graph_pass should be the first default pass - // after mkldnn_placement_pass. - passes.push_back("infer_clean_graph_pass"); - passes.push_back("graph_viz_pass"); // add graphviz for debug. - for (auto& pass : ir_passes_) { - // skip mkldnn pass when use_mkldnn_ = false; - bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos; - if (!disabled_ir_passes_.count(pass) && !skip_pass) { - passes.push_back(pass); - passes.push_back("graph_viz_pass"); // add graphviz for debug. 
- } - } - argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); - - for (auto& x : data_) { - PADDLE_ENFORCE(x->Initialize(argument)); - x->RunAll(); - PADDLE_ENFORCE(x->Finalize()); - } -} - -Analyzer& Analyzer::IncludeAllIrPasses() { - ir_passes_ = all_ir_passes_; - return *this; -} - -Analyzer& Analyzer::DisableIrPasses(const std::vector& passes) { - disabled_ir_passes_.insert(passes.begin(), passes.end()); - return *this; -} - -Analyzer& Analyzer::IncludeIrPasses(const std::vector& passes) { - ir_passes_ = passes; - return *this; -} - -Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) { - use_mkldnn_ = use_mkldnn; - return *this; } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 3af1d572dfd81197797dd7e57d87ba12c2f3548e..b43e67f20f493cd8151871ca3a36eb6fdadcf9ff 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -40,56 +40,21 @@ limitations under the License. */ #include #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { namespace inference { namespace analysis { -class Analyzer : public OrderedRegistry { +class Analyzer final { public: - // Register all the pass-managers. Analyzer(); void Run(Argument* argument); - Analyzer& DisableIrPasses(const std::vector& passes); - Analyzer& IncludeIrPasses(const std::vector& passes); - Analyzer& IncludeAllIrPasses(); - Analyzer& SetUseMkldnn(bool use_mkldnn); - DISABLE_COPY_AND_ASSIGN(Analyzer); - private: - // All avaiable IR passes. - // The bigger fuse comes first, so that the small operators prefer to be - // merged in a larger fuse op. The small fusion will not break the pattern of - // larger fusion. - const std::vector all_ir_passes_{{ - // Manual update the passes here. 
- "attention_lstm_fuse_pass", // - "seqconv_eltadd_relu_fuse_pass", // - "embedding_fc_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // -#ifdef PADDLE_WITH_MKLDNN - "depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass", // -#endif - }}; - - std::unordered_set disabled_ir_passes_; - // Ir passes to run - std::vector ir_passes_; - bool use_mkldnn_; + protected: + void RunIrAnalysis(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 5430e5c1ef1c70d27295ebc1a9bd427cd95f006a..48fc5dda2a5bfa24d679d4bf655e580dafc614b3 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -27,21 +27,21 @@ namespace analysis { using namespace framework; // NOLINT TEST(Analyzer, analysis_without_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = false; Argument argument; - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); + argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + Analyzer analyser; analyser.Run(&argument); } TEST(Analyzer, analysis_with_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; - argument.Set("minimum_subgraph_size", new int(0)); - argument.Set("max_batch_size", new int(3)); - argument.Set("workspace_size", new int(1 << 20)); - argument.Set("precision_mode", new std::string("FP32")); - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); + argument.SetTensorRtMaxBatchSize(3); + argument.SetTensorRtWorkspaceSize(1 << 20); + 
argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 9495e2435c79ff660c64322d2acd8e058e09e563..d7a2f3d1e3a3251263c8670aef5db538fa2c48ea 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -24,13 +24,16 @@ #pragma once #include +#include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { namespace analysis { +using framework::ir::Graph; /* * The argument definition of both Pass and PassManagers. @@ -39,75 +42,99 @@ namespace analysis { */ struct Argument { Argument() = default; - explicit Argument(const std::string& fluid_model_dir) - : fluid_model_dir(new std::string(fluid_model_dir)) {} - // The directory of the trained model. - std::unique_ptr fluid_model_dir; - // The path of `__model__` and `param`, this is used when the file name of - // model and param is changed. - std::unique_ptr fluid_model_program_path; - std::unique_ptr fluid_model_param_path; - - // The graph that process by the Passes or PassManagers. - std::unique_ptr main_dfg; - - // The original program desc. - std::unique_ptr origin_program_desc; - - // The processed program desc. - std::unique_ptr transformed_program_desc; - - // The output storage path of ModelStorePass. - std::unique_ptr model_output_store_path; - - // Support for any other attributes. 
- template - void Set(const std::string& key, T* data) { - PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]", - key); - attrs_[key] = data; - attr_deleters_[key] = [data, key]() { - VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - VLOG(30) << "argument delete attr: " << key; - delete data; - }; - } - - bool Has(const std::string& name) const { return attrs_.count(name); } - - template - T* Release(const std::string& key) { - PADDLE_ENFORCE(attrs_.count(key)); - auto* res = boost::any_cast(attrs_.at(key)); - attrs_.erase(key); - attr_deleters_.erase(key); - return res; - } - - template - T& Get(const std::string& key) { - PADDLE_ENFORCE(Has(key)); - return *boost::any_cast(attrs_.at(key)); - } - - ~Argument() { - for (auto& item : attr_deleters_) { - item.second(); - } - } + explicit Argument(const std::string& model_dir) { SetModelDir(model_dir); } + + using unique_ptr_t = std::unique_ptr>; + using fusion_statis_t = std::unordered_map; + + bool Has(const std::string& key) const { return valid_fields_.count(key); } + +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE(Has(#field__)); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ + type__ field__##_; + +#define DECL_ARGUMENT_FIELD_VALID(field__) \ + bool field__##_valid() { return Has(#field__); } + +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL(field__##_); \ + PADDLE_ENFORCE(Has(#field__)); \ + return *static_cast(field__##_.get()); \ + } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + 
valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE(Has(#field__)); \ + return static_cast(field__##_.get()); \ + } \ + type__* Release##Field() { \ + PADDLE_ENFORCE(Has(#field__)); \ + valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ + } \ + \ + private: \ + unique_ptr_t field__##_; + + // Model path + DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); + // Model specified with program and parameters files. + DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); + DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); + + // The overall graph to work on. + DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); + // The overall Scope to work on. + DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + + DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); + + // The ir passes to perform in analysis phase. + DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, + std::vector); + + DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); + DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, + std::function); + DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); + DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + + // The program transformed by IR analysis phase. 
+ DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, + framework::proto::ProgramDesc); + + DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); private: - std::unordered_map attrs_; - std::unordered_map> attr_deleters_; + std::unordered_set valid_fields_; }; -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ - if (UNLIKELY(!(field__))) { \ - LOG(ERROR) << "field " << #field__ << " should be set."; \ - return false; \ - } +#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ + PADDLE_ENFORCE(argument__->Has(#fieldname__), \ + "the argument field [%s] should be set", #fieldname__); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc deleted file mode 100644 index bdcb30f159ec17a8d3b23d020b2ea082e71ace1b..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ /dev/null @@ -1,496 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/dot.h" -#include "paddle/fluid/inference/analysis/node.h" - -namespace paddle { -namespace inference { -namespace analysis { -using ir_node_t = framework::ir::Node; -using ir_graph_t = framework::ir::Graph; - -// It is a better idea that the inputs and outputs of this graph is set manually -// before, but there must be a Pass that helps to prune the unnecessary ops that -// do not contribute to the given targets, so in this pass, analysis and get the -// inputs and outputs is OK. -void DataFlowGraph::Build() { - inputs_.clear(); - outputs_.clear(); - std::unordered_set ins; - std::unordered_set outs; - for (auto &node : nodes.nodes()) { - for (auto *in : node->inlinks) { - ins.insert(in); - } - for (auto *out : node->outlinks) { - outs.insert(out); - } - } - - // The nodes that in ins but not in outs is the graph's inputs - // similarly, the nodes that in outs but not in ins is the graphs' outputs - for (auto *in : ins) { - if (!outs.count(in)) { - inputs_.push_back(in); - } - } - for (auto *out : outs) { - if (!ins.count(out)) { - outputs_.push_back(out); - } - } - - Clean(); -} - -void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { - // insert vars - // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id - // will keep updating to its latest alias during the graph-building. 
- std::unordered_map var2id; - auto &main_block = prog.blocks(framework::kRootBlockIndex); - for (int i = 0; i < main_block.vars_size(); i++) { - const auto &var = main_block.vars(i); - auto *v = nodes.Create(Node::Type::kValue); - v->SetName(var.name()); - v->SetPbDesc(const_cast(static_cast(&var))); - v->SetPbMsg(var.SerializeAsString()); - var2id[var.name()] = v->id(); - } - - // The variables in a SSA can only write once, so if a variable is written - // multiple times(quite common in our ProgramDesc design), multiple alias - // Nodes of this variable will be created, and each will just write once. - - // An set that keep all the names of the variables(the original, not alias) - // that have been written(as outputs). Once an Op's output variable hit the - // set, it should create a new alias and update the global alias for this - // variable. And that make a Data Flow Graph a SSA. - std::unordered_set unique_written_vars; - for (int i = 0; i < main_block.ops_size(); i++) { - const auto &op = main_block.ops(i); - auto *o = nodes.Create(Node::Type::kFunction); - o->SetName(op.type()); - static_cast(o)->SetFuncType(op.type()); - // Link to the original protobuf message's memory, make it easier to - // generate from a data flow graph to fluid ProgramDesc. - o->SetPbDesc(const_cast(static_cast(&op))); - o->SetPbMsg(op.SerializeAsString()); - - // set inputs and outputs - for (int j = 0; j < op.inputs_size(); j++) { - auto &in_var = op.inputs(j); - for (int k = 0; k < in_var.arguments_size(); k++) { - auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k))); - in->outlinks.push_back(o); - o->inlinks.push_back(in); - unique_written_vars.insert(in); - } - } - for (int j = 0; j < op.outputs_size(); j++) { - auto &out_var = op.outputs(j); - for (int k = 0; k < out_var.arguments_size(); k++) { - auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]); - if (unique_written_vars.count(out)) { - // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
- auto *out_alias = nodes.Create(Node::Type::kValue); - out_alias->SetName(out->name()); - out_alias->SetPbDesc(out->pb_desc()); - out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = - out_alias->id(); // update variable's alias Node - LOG(INFO) << "loop found in graph, create SSA alias node [" - << out_alias->repr() << "] for [" << out->repr() << "]"; - out = out_alias; - } - out->inlinks.push_back(o); - o->outlinks.push_back(out); - } - } - } - // Analysis and extract the inputs and outputs of this graph. - Build(); -} - -void DataFlowGraph::Build(const framework::ir::Graph &graph) { - // Create nodes - std::unordered_map ir_node_map; - for (auto *ir_node : graph.Nodes()) { - Node *x{nullptr}; - if (ir_node->IsOp()) { - PADDLE_ENFORCE(ir_node->Op()); - VLOG(40) << "get op " << ir_node << " " << ir_node->Name(); - x = nodes.Create(Node::Type::kFunction); - x->attr("ir_node").Pointer() = ir_node; - PADDLE_ENFORCE(ir_node->Op()->Proto()); - x->SetName(ir_node->Op()->Proto()->type()); - x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString()); - } else if (ir_node->IsVar()) { - // Not create a Node for IR ControlDepVar, considering Inference currently - // just used in single thread scenerio. - VLOG(40) << "get var " << ir_node->Name(); - x = nodes.Create(Node::Type::kValue); - x->attr("ir_node").Pointer() = ir_node; - x->SetName(ir_node->Name()); - // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString()); - } else { - PADDLE_THROW("Failed to create an Node from IR, unknown type"); - } - ir_node_map.emplace(ir_node, x); - } - VLOG(40) << "finish creating Nodes"; - - VLOG(40) << "to create edge"; - // Create links - for (auto *ir_node : graph.Nodes()) { - auto it = ir_node_map.find(ir_node); - // Skip ControlDepVar. 
- if (it == ir_node_map.end()) continue; - auto *node = it->second; - for (auto *x : ir_node->inputs) { - if (!ir_node_map.count(x)) continue; - node->inlinks.push_back(ir_node_map.at(x)); - } - for (auto *x : ir_node->outputs) { - if (!ir_node_map.count(x)) continue; - node->outlinks.push_back(ir_node_map.at(x)); - } - } - - Build(); - PADDLE_ENFORCE(!inputs_.empty(), - "Can't deduce any inputs from the graph, Is the graph empty?"); - - ir_graph = &graph; - VLOG(30) << "finished build from IR"; -} - -void DataFlowGraph::Clean() { - for (auto &node : nodes.nodes()) { - std::unordered_set inlinks_set(node->inlinks.begin(), - node->inlinks.end()); - std::unordered_set outlinks_set(node->outlinks.begin(), - node->outlinks.end()); - if (inlinks_set.size() < node->inlinks.size()) { - node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); - } - if (outlinks_set.size() < node->outlinks.size()) { - node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); - } - } -} - -std::string DataFlowGraph::DotString() const { - Dot dot; - - // Add nodes - for (size_t i = 0; i < nodes.size(); i++) { - const Node &node = nodes.Get(i); - dot.AddNode(node.repr(), node.dot_attrs()); - } - - // Add edges - for (size_t i = 0; i < nodes.size(); i++) { - const Node &node = nodes.Get(i); - for (auto &in : node.inlinks) { - dot.AddEdge(in->repr(), node.repr(), {}); - } - } - return dot.Build(); -} - -std::string DataFlowGraph::HumanReadableInfo(bool show_values, - bool show_functions) const { - std::stringstream values, functions; - for (auto &n : nodes.nodes()) { - if (show_values && n->IsValue()) { - values << n->repr() << "\n"; - } - if (show_functions && n->IsFunction()) { - functions << n->repr() << "\n"; - } - } - return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str(); -} - -// -// NodesBFSIterator -// - -GraphTraits::NodesBFSIterator::NodesBFSIterator( - const std::vector &source) - : queue_(source.begin(), source.end()) {} - 
-GraphTraits::NodesBFSIterator::NodesBFSIterator( - GraphTraits::NodesBFSIterator &&other) noexcept - : queue_(std::move(other.queue_)), - visited_(std::move(other.visited_)) {} - -GraphTraits::NodesBFSIterator::NodesBFSIterator( - const GraphTraits::NodesBFSIterator &other) - : queue_(other.queue_), visited_(other.visited_) {} - -Node &GraphTraits::NodesBFSIterator::operator*() { - PADDLE_ENFORCE(!queue_.empty()); - return *queue_.front(); -} - -Node *GraphTraits::NodesBFSIterator::operator->() { - PADDLE_ENFORCE(!queue_.empty()); - return queue_.front(); -} - -GraphTraits::NodesBFSIterator & -GraphTraits::NodesBFSIterator::operator=( - const GraphTraits::NodesBFSIterator &other) { - queue_ = other.queue_; - visited_ = other.visited_; - return *this; -} - -GraphTraits::NodesBFSIterator - &GraphTraits::NodesBFSIterator::operator++() { - PADDLE_ENFORCE(!queue_.empty()); - auto *cur = queue_.front(); - visited_.insert(cur); - queue_.pop_front(); - for (auto *output : cur->outlinks) { - if (!visited_.count(output)) { - queue_.push_back(output); - visited_.insert(output); - } - } - return *this; -} - -bool GraphTraits::NodesBFSIterator::operator==( - const GraphTraits::NodesBFSIterator &other) { - if (queue_.empty()) return other.queue_.empty(); - if ((!queue_.empty()) && (!other.queue_.empty())) { - return queue_.front() == other.queue_.front() && - visited_.size() == other.visited_.size(); - // equality of queue and - // visited. Just a light but week implementation. 
- } - return false; -} - -// -// NodesDFSIterator -// -GraphTraits::NodesDFSIterator::NodesDFSIterator( - const std::vector &source) { - for (auto *x : source) stack_.push(x); -} - -GraphTraits::NodesDFSIterator::NodesDFSIterator( - GraphTraits::NodesDFSIterator &&other) noexcept - : stack_(std::move(other.stack_)), - visited_(std::move(other.visited_)) {} - -GraphTraits::NodesDFSIterator::NodesDFSIterator( - const GraphTraits::NodesDFSIterator &other) - : stack_(other.stack_), visited_(other.visited_) {} - -Node &GraphTraits::NodesDFSIterator::operator*() { - PADDLE_ENFORCE(!stack_.empty()); - return *stack_.top(); -} - -GraphTraits::NodesDFSIterator - &GraphTraits::NodesDFSIterator::operator++() { - if (stack_.empty()) return *this; - visited_.insert(stack_.top()); - auto *cur = stack_.top(); - stack_.pop(); - for (auto *x : cur->outlinks) { - if (!visited_.count(x)) { - stack_.push(x); - visited_.insert(x); - } - } - return *this; -} -bool GraphTraits::NodesDFSIterator::operator==( - const GraphTraits::NodesDFSIterator &other) { - if (stack_.empty()) return other.stack_.empty(); - if ((!stack_.empty()) && (!other.stack_.empty())) { - return stack_.top() == other.stack_.top(); - } - return false; -} - -GraphTraits::NodesDFSIterator & -GraphTraits::NodesDFSIterator::operator=( - const GraphTraits::NodesDFSIterator &other) { - stack_ = other.stack_; - visited_ = other.visited_; - return *this; -} -Node *GraphTraits::NodesDFSIterator::operator->() { - return stack_.top(); -} - -inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { - return node.inlinks.size() == n; -} - -GraphTraits::NodesTSIterator::NodesTSIterator( - const std::vector &source) { - PADDLE_ENFORCE(!source.empty(), - "Start points of topological sorting should not be empty!"); - // CHECK all the inputs' in-degree is 0 - for (auto *node : source) { - PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); - } - - std::unordered_set visited; - std::unordered_set to_visit{source.begin(), 
source.end()}; - - std::vector inlink_visited; - while (!to_visit.empty()) { - std::vector queue(to_visit.begin(), to_visit.end()); - for (auto *p : queue) { - if (p->deleted()) { - visited.insert(p); - to_visit.erase(p); - continue; - } - inlink_visited.clear(); - - std::copy_if(p->inlinks.begin(), p->inlinks.end(), - std::back_inserter(inlink_visited), - [&](Node *x) { return visited.count(x); }); - - if (inlink_visited.size() == p->inlinks.size()) { - sorted_.push_back(p); - for (auto *_ : p->outlinks) { - if (!visited.count(_)) { - to_visit.insert(_); - } - } - - to_visit.erase(p); - visited.insert(p); - } - } - } -} - -GraphTraits::NodesTSIterator::NodesTSIterator( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) - : sorted_(other.sorted_), cursor_(other.cursor_) {} - -Node &GraphTraits::NodesTSIterator::operator*() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return *sorted_[cursor_]; -} - -paddle::inference::analysis::GraphTraits::NodesTSIterator - &GraphTraits::NodesTSIterator::operator++() { - if (++cursor_ >= sorted_.size()) { - sorted_.clear(); - cursor_ = 0; - } - return *this; -} -paddle::inference::analysis::GraphTraits::NodesTSIterator & -GraphTraits::NodesTSIterator::operator=( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) { - cursor_ = other.cursor_; - sorted_ = other.sorted_; - return *this; -} - -bool GraphTraits::NodesTSIterator::operator==( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) { - return sorted_ == other.sorted_ && cursor_ == other.cursor_; -} - -Node *GraphTraits::NodesTSIterator::operator->() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return sorted_[cursor_]; -} - -std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, 
check whether its inlink is in the subgraph. - auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. - if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} - -// Filter the Intermediate results of the subgraph node. -void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { - std::vector op_nodes; - for (auto &node : GraphTraits(*graph).nodes_in_TS()) { - if (node.type() == Node::Type::kValue || node.deleted()) { - continue; - } - op_nodes.push_back(&node); - } - size_t op_num = op_nodes.size(); - for (size_t i = 0; i < op_num; i++) { - if (op_nodes[i]->type() == Node::Type::kFunction) continue; - std::unordered_set follow_up_input_names; - for (size_t j = i + 1; j < op_num; j++) { - for (auto *in : op_nodes[j]->inlinks) { - follow_up_input_names.insert(in->name()); - } - } - std::vector filtered_subgraph_outlinks; - for (auto *out : op_nodes[i]->outlinks) { - if (follow_up_input_names.count(out->name())) { - filtered_subgraph_outlinks.push_back(out); - } else { - out->SetDeleted(); - } - } - // The filtered_subgraph_outlinks may be empty. 
- op_nodes[i]->outlinks = filtered_subgraph_outlinks; - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h deleted file mode 100644 index 437e097acd24aad384df6712ce0de6106b3b5c65..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ /dev/null @@ -1,209 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * Data flow graph is an pass that build the basic graph. It contains a graph - * and the iterators that enable the iteration over the graph. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/inference/analysis/graph_traits.h" -#include "paddle/fluid/inference/analysis/node.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * DataFlowGraph - A container of Value and Function Nodes. - * - * This is the base graph for any other type of graphs, such as SSA or CFG. - */ -struct DataFlowGraph { - NodeMap nodes; - // inputs and outputs are deduced from the graph. - // Used to interact with IR. - const framework::ir::Graph *ir_graph{nullptr}; - - // Extract inputs and outputs of the graph. 
- void Build(); - - void Build(const framework::proto::ProgramDesc &prog); - - // Build a graph from ir::Graph. - void Build(const framework::ir::Graph &graph); - - // Get an attribute. - AnyAttr &Attr(const std::string &key) { return attrs_[key]; } - - // Output a DOT graph file for debug. - std::string DotString() const; - - std::string HumanReadableInfo(bool show_values = true, - bool show_functions = true) const; - - const std::vector &inputs() const { - PADDLE_ENFORCE(!inputs_.empty(), - "No inputs are deduced, need to Build() first."); - return inputs_; - } - const std::vector &outputs() const { - PADDLE_ENFORCE(!outputs_.empty(), - "No outputs are deduced, need to Build() first."); - return outputs_; - } - - private: - mutable std::vector inputs_; - mutable std::vector outputs_; - std::unordered_map attrs_; - - // Remove duplicate edges and so on. - void Clean(); -}; - -/* - * An graph trait help to traverse the graph using BFS. - * The BFS start from a graph's inputs, the graph should be fully-connected, so - * that the iterator can reach the end. - */ -template <> -struct GraphTraits { - // BFS iterator on nodes. - struct NodesBFSIterator - : public std::iterator { - NodesBFSIterator() = default; - explicit NodesBFSIterator(const std::vector &source); - NodesBFSIterator(NodesBFSIterator &&other) noexcept; - // NOTE Heavy to use. - NodesBFSIterator(const NodesBFSIterator &other); - - Node &operator*(); - NodesBFSIterator &operator++(); - Node *operator->(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesBFSIterator &operator=(const NodesBFSIterator &other); - bool operator==(const NodesBFSIterator &other); - bool operator!=(const NodesBFSIterator &other) { return !(*this == other); } - - private: - std::deque queue_; - std::unordered_set visited_; - }; - - // DFS iterator on nodes. 
- struct NodesDFSIterator - : public std::iterator { - NodesDFSIterator() = default; - NodesDFSIterator(const std::vector &source); - NodesDFSIterator(NodesDFSIterator &&other) noexcept; - NodesDFSIterator(const NodesDFSIterator &other); - - Node &operator*(); - NodesDFSIterator &operator++(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesDFSIterator &operator=(const NodesDFSIterator &other); - bool operator==(const NodesDFSIterator &other); - bool operator!=(const NodesDFSIterator &other) { return !(*this == other); } - Node *operator->(); - - private: - std::stack stack_; - std::unordered_set visited_; - }; - - // Topological sorting iterator on nodes. - struct NodesTSIterator - : public std::iterator { - NodesTSIterator() = default; - NodesTSIterator(const std::vector &source); - NodesTSIterator(NodesTSIterator &&other) - : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { - other.cursor_ = 0; - } - NodesTSIterator(const NodesTSIterator &other); - - Node &operator*(); - NodesTSIterator &operator++(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesTSIterator &operator=(const NodesTSIterator &other); - bool operator==(const NodesTSIterator &other); - bool operator!=(const NodesTSIterator &other) { return !(*this == other); } - Node *operator->(); - - private: - std::vector sorted_; - size_t cursor_{0}; - }; - - explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {} - - // default use BFS to visit the nodes. 
- iterator_range nodes() { - return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); - } - iterator_range nodes_in_BFS() { - return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); - } - iterator_range nodes_in_DFS() { - return iterator_range(nodes_dfs_begin(), nodes_dfs_end()); - } - iterator_range nodes_in_TS() { - return iterator_range(nodes_ts_begin(), nodes_ts_end()); - } - - private: - NodesBFSIterator nodes_bfs_begin() { - return NodesBFSIterator(graph_.inputs()); - } - NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } - - NodesDFSIterator nodes_dfs_begin() { - return NodesDFSIterator(graph_.inputs()); - } - NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } - - NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); } - NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } - - private: - const DataFlowGraph &graph_; -}; - -// Extract the inputs and outputs of a graph. The inputs and outputs of a -// sub-graph is the inputs nodes and output nodes that doesn't inside the -// sub-graph. -std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT - -void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc deleted file mode 100644 index 50ce20621fb289023ecccf7bb39d98169765d5ee..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - dfg.Build(); - - for (auto* in : dfg.inputs()) { - LOG(INFO) << "inputs: " << in->name() << " " - << static_cast(in->type()); - } - for (auto* out : dfg.outputs()) { - LOG(INFO) << "outputs: " << out->name() << " " - << static_cast(out->type()); - } - - size_t count = 0; - for (auto& node : GraphTraits(dfg).nodes()) { - LOG(INFO) << "visiting " << node.name(); - ++count; - } - ASSERT_EQ(count, dfg.nodes.size()); -} - -TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - DataFlowGraph dfg; - dfg.Build(desc); - size_t count = 0; - for (auto& node : GraphTraits(dfg).nodes_in_DFS()) { - LOG(INFO) << "visiting " << node.name(); - ++count; - } - ASSERT_EQ(count, dfg.nodes.size()); -} - -// Topological sorting. 
-/* - * Graph topology - * inputs: 0, 1, 2 - * 0 -> 4 - * 0 -> 5 - * 1 -> 6 - * 2 -> 7 - * 4 -> 5 - * 4 -> 7 - * 4 -> 3 - * 7 -> 3 - */ -TEST(DataFlowGraph, TS) { - DataFlowGraph graph; - - for (int i = 0; i < 8; i++) { - auto* node = graph.nodes.Create(Node::Type::kValue); - node->SetName("node-" + std::to_string(i)); - } - - auto add_link = [&](int i, int j) { - Node* source = graph.nodes.GetMutable(i); - Node* target = graph.nodes.GetMutable(j); - target->inlinks.push_back(source); - source->outlinks.push_back(target); - }; - - add_link(0, 4); - add_link(0, 5); - add_link(1, 6); - add_link(2, 7); - add_link(4, 5); - add_link(4, 7); - add_link(4, 3); - add_link(7, 3); - graph.Build(); - - auto its = GraphTraits(graph).nodes_in_TS(); - std::vector sorted_ids; - for (auto it = its.begin(); it != its.end(); ++it) { - LOG(INFO) << it->name(); - sorted_ids.push_back(it->id()); - } - - // Assert a occurs prior to b in the sorted_ids. - auto assert_positive_sequence_pair = [&](int a, int b) { - auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a); - auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b); - ASSERT_LT(a_offset, b_offset); - }; - - assert_positive_sequence_pair(2, 7); - assert_positive_sequence_pair(7, 3); - assert_positive_sequence_pair(4, 3); - assert_positive_sequence_pair(0, 4); - assert_positive_sequence_pair(0, 5); - assert_positive_sequence_pair(1, 6); - assert_positive_sequence_pair(4, 5); - assert_positive_sequence_pair(4, 7); -} - -TEST(DataFlowGraph, Build_ProgramDesc) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - DataFlowGraph graph; - graph.Build(desc); - ASSERT_EQ(graph.nodes.size(), 38UL); -} - -void SetOp(framework::ProgramDesc* prog, const std::string& type, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - op->SetInput("Xs", inputs); - op->SetOutput("Xs", outputs); - 
op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(framework::OpRole::kForward)); -} - -TEST(DataFlowGraph, Build_IR_Graph) { - framework::ProgramDesc prog; - for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(framework::proto::VarType::SELECTED_ROWS); - if (v == "c") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"a"}), - std::vector({"c"})); - SetOp(&prog, "mul", std::vector({"b", "c"}), - std::vector({"d"})); - SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), - std::vector({"f"})); - - DataFlowGraph graph; - - framework::ir::Graph ir_graph(prog); - - graph.Build(ir_graph); - - ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc deleted file mode 100644 index dbe138514b20a4be20e7cca800f8e12b230e7824..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/proto_desc.h" -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/io.h" - -namespace paddle { -namespace inference { - -namespace analysis { - -using framework::proto::ProgramDesc; - -std::vector ExtractParameters( - const std::vector> &nodes); - -bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument) - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - // The transformed_program_desc should inherit all the VarDesc and BlockDesc - // from the original program desc. The operators of the main block(the first - // block) should rewritten by data flow graph. - argument->transformed_program_desc.reset( - new ProgramDesc(*argument->origin_program_desc)); - argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex) - ->clear_ops(); - desc_ = argument->transformed_program_desc.get(); - argument_ = argument; - return true; -} - -bool DataFlowGraphToFluidPass::Finalize() { return true; } - -void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { - // FilterRedundantOutputOfSubGraph(graph); - for (auto &node : GraphTraits(*graph).nodes_in_TS()) { - if (node.deleted()) continue; - - switch (node.type()) { - case Node::Type::kFunction: { - AddFluidOp(&node); - } break; - case Node::Type::kFunctionBlock: { - AddEngineOp(&node); - } break; - default: - continue; - } - } - - if (argument_->Has(framework::ir::kParamScopeAttr)) { - LOG(WARNING) << "parameter changes in the scope takes effect"; - } - - PADDLE_ENFORCE(argument_->transformed_program_desc.get()); -} - -void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { - 
PADDLE_ENFORCE(node); - PADDLE_ENFORCE(node->IsFunction()); - PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(), - "node has invalid protobuf repr."); - - // currently only the main block is analyzed. - PADDLE_ENFORCE(desc_); - auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto *op = main_block->add_ops(); - - if (node->pb_desc()) { - auto *ori_op = static_cast(node->pb_desc()); - *op = - *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. - // The inputs and outputs of the existing ops are not changed by tensorrt - // subgraph pass. - // NOTE It might be changed by other passes in the long run. - } else { - op->ParseFromString(node->pb_msg()); - } -} - -void CreateTrtEngineOp(Node *node, Argument *argument, - framework::proto::BlockDesc *block) { - PADDLE_ENFORCE(argument->main_dfg.get()); - const DataFlowGraph &graph = *(argument->main_dfg); - static int counter{0}; - PADDLE_ENFORCE(node->IsFunctionBlock()); - framework::OpDesc desc; - auto *func = static_cast(node); - - // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; - for (auto *x : func->inlinks) { - input_names.insert(x->name()); - input_names_with_id.insert(x->name() + std::to_string(x->id())); - } - desc.SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); - - std::unordered_set output_names; - std::unordered_set output_names_with_id; - for (auto *x : func->outlinks) { - output_names.insert(x->name()); - output_names_with_id.insert(x->name() + std::to_string(x->id())); - } - - desc.SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - desc.SetType("tensorrt_engine"); - - std::unordered_map output_name_map; - - // The following procedure is used to rename all the intermediate - // variables and the output variables of the subgraph. - // Why we do this? 
- // During the transition from fluid OP to tensorrt OP, we map - // the input and output Tensor(fluid data structure) of fluid OP - // to the correspondin ITensor (trt data structure) through the - // Tensor name. When we set up ITensor for an variable, we must - // ensure that it has not been set before. - // If there is variable in the fluid graph, which is not only the - // input of a OP, but also the output of a Op, there will be problems. - // So we have to rename the variable in the subgraph to make sure - // it is either an OP's input or an OP's output. - - auto subgraph_nodes = func->subgraph; - for (int index = 0; index < block->ops_size(); index++) { - framework::proto::OpDesc *op = block->mutable_ops(index); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inlinks) { - var2id[in_var->name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outlinks) { - var2id[out_var->name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < 
out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } - // When tensorrt engine runs at the end of the operation, - // output_mapping help us copy the data from the renamed ITensor - // to Tensor. - std::vector output_mapping; - for (auto name : output_names) { - PADDLE_ENFORCE(output_name_map.count(name) != 0); - output_mapping.push_back(output_name_map[name]); - } - - PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); - // Set attrs - - SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); - SetAttr(desc.Proto(), "max_batch_size", argument->Get("max_batch_size")); - SetAttr(desc.Proto(), "workspace_size", argument->Get("workspace_size")); - SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); - SetAttr(desc.Proto(), "output_name_mapping", output_mapping); - node->SetPbMsg(desc.Proto()->SerializeAsString()); -} - -std::vector ExtractParameters( - const std::vector> &nodes) { - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsValue()) continue; - PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); - framework::proto::VarDesc var; - var.ParseFromString(node->pb_msg()); - if (var.persistable()) { - parameters.push_back(var.name()); - } - } - return parameters; -} - -void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { - // TODO(Superjomn) Here need to expose some arguments for default setting. 
- PADDLE_ENFORCE(node->IsFunctionBlock()); - auto *block_node = static_cast(node); - framework::proto::BlockDesc proto; - framework::BlockDesc block_desc(nullptr, &proto); - block_desc.Proto()->set_parent_idx(-1); - block_desc.Proto()->set_idx(0); - VLOG(40) << "origin variable size: " - << argument_->origin_program_desc->blocks(0).vars().size(); - VLOG(40) << "transformed variable size: " - << block_desc.Proto()->vars().size(); - // copy ops. - - for (auto *node : block_node->subgraph) { - auto *op = block_desc.AppendOp(); - PADDLE_ENFORCE(!node->pb_msg().empty()); - op->Proto()->ParseFromString(node->pb_msg()); - } - - *block_desc.Proto()->mutable_vars() = - argument_->origin_program_desc->blocks(0).vars(); - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, argument_, block_desc.Proto()); - auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto *op = main_block->add_ops(); - PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); - op->ParseFromString(node->pb_msg()); -} - -namespace { -class DFG_DebuggerPass : public DFG_GraphvizDrawPass { - public: - using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config &config) - : DFG_GraphvizDrawPass(config) {} - - std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } - - bool Finalize() override { return true; } -}; -} // namespace - -AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { - return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_IA_graphviz_log_root, - "data_flow_graph_to_fluid_graphviz_debugger")); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h deleted file mode 100644 index 891c7226e245fa3b92892785362c186185a61f62..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* - * This file implements the transformation from fluid ProgramDesc to data flow - * graph. - */ - -#pragma once - -#include -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" - -namespace paddle { -namespace inference { - -namespace analysis { -class DataFlowGraphToFluidPass final : public DataFlowGraphPass { - public: - DataFlowGraphToFluidPass() = default; - - bool Initialize(Argument *argument) override; - bool Finalize() override; - - void Run(DataFlowGraph *graph) override; - - std::string repr() const override { return "DFG to fluid"; } - std::string description() const override { - return "Transform a DFG to a Fluid ProgramDesc"; - } - - AnalysisPass *CreateGraphvizDebugerPass() const override; - - protected: - // Add a Fluid Op into the ProgramDesc. - void AddFluidOp(Node *node); - // Add a EngineOp into the ProgramDesc. 
- void AddEngineOp(Node *node); - - private: - framework::proto::ProgramDesc *desc_; - Argument *argument_; -}; -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc deleted file mode 100644 index 4ef381db295b986b91173a728b6d98640f6f4f51..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" - -#include -#include -#include -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/io.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DataFlowGraph, Test) { - Argument argument(FLAGS_inference_model_dir); - - FluidToDataFlowGraphPass pass0; - DataFlowGraphToFluidPass pass1; - ASSERT_TRUE(pass0.Initialize(&argument)); - ASSERT_TRUE(pass1.Initialize(&argument)); - - pass0.Run(argument.main_dfg.get()); - pass1.Run(argument.main_dfg.get()); - - pass0.Finalize(); - pass1.Finalize(); - - LOG(INFO) << argument.main_dfg->nodes.size(); -} - -}; // namespace analysis -}; // namespace inference -}; // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc deleted file mode 100644 index 8888529a57a29c4349095c2ff4c527346716e026..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -int DFG_GraphvizDrawPass::counter_{0}; - -void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { - auto content = Draw(graph); - auto dot_path = GenDotPath(); - std::ofstream file(dot_path); - file.write(content.c_str(), content.size()); - file.close(); - - auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; - std::string message; - VLOG(30) << "draw to " << png_path; - ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); -} - -std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { - Dot dot; - // Add nodes - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (config_.display_deleted_node || !node.deleted()) { - dot.AddNode(node.repr(), node.dot_attrs()); - } - } - // Add edges - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &out : node.outlinks) { - if (!config_.display_deleted_node && out->deleted()) continue; - dot.AddEdge(node.repr(), out->repr(), {}); - } - } - return dot.Build(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h deleted file mode 100644 index e537bfc0e64d4ff46b3d61499a1a0298ed83533f..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file create an DFG_GraphvizDrawPass which helps to draw a data flow - * graph's structure using graphviz. - */ - -#pragma once - -#include -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/dot.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Output a dot file and write to some place. - */ -class DFG_GraphvizDrawPass : public DataFlowGraphPass { - public: - struct Config { - Config(const std::string &dir, const std::string &id, - bool display_deleted_node = false) - : dir(dir), id(id), display_deleted_node(display_deleted_node) {} - - // The directory to store the .dot or .png files. - const std::string dir; - // The identifier for this dot file. - const std::string id; - // Whether to display deleted nodes, default false. - const bool display_deleted_node; - }; - - explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} - - bool Initialize(Argument *argument) override { return true; } - void Run(DataFlowGraph *graph) override; - bool Finalize() override { return true; } - - std::string repr() const override { return "DFG graphviz drawer"; } - std::string description() const override { - return "Debug a DFG by draw with graphviz"; - } - - protected: - // A counter to add a number prefix to the debugger image output so that they - // will sort in the triggered order. - static int counter_; - - // Path of the dot file to output. 
- std::string GenDotPath() const { - return config_.dir + "/" + std::to_string(counter_++) + "-graph_" + - config_.id + ".dot"; - } - - virtual std::string Draw(DataFlowGraph *graph); - - Config config_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc deleted file mode 100644 index 928be7917047382d9b86294f6039b26b0ebf6f49..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" - -#include -#include -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { - Argument argument(FLAGS_inference_model_dir); - FluidToDataFlowGraphPass pass0; - ASSERT_TRUE(pass0.Initialize(&argument)); - pass0.Run(argument.main_dfg.get()); - - // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); - - DFG_GraphvizDrawPass::Config config("./", "test"); - DFG_GraphvizDrawPass pass(config); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); - - // test content - std::ifstream file("./0-graph_test.dot"); - ASSERT_TRUE(file.is_open()); - - std::string line; - int no{0}; - while (std::getline(file, line)) { - no++; - } - // DFG is sensitive to ProgramDesc, be careful to change the existing models. - ASSERT_EQ(no, 83); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc index 56ceb9bd5d6f41a601d66f6124fb7b4099c9337e..c785a312bf96c3586ea990fd9028cfd3b930d577 100644 --- a/paddle/fluid/inference/analysis/dot_tester.cc +++ b/paddle/fluid/inference/analysis/dot_tester.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/inference/analysis/data_flow_graph.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc deleted file mode 100644 index 2b7d632c839e735ca03c6e17b94307b40cc13374..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - if (argument->origin_program_desc) { - LOG(WARNING) << "argument's origin_program_desc is already set, might " - "duplicate called"; - } - if (!argument->fluid_model_program_path) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); - argument->fluid_model_program_path.reset( - new std::string(*argument->fluid_model_dir + "/__model__")); - } - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); - auto program = LoadProgramDesc(*argument->fluid_model_program_path); - argument->origin_program_desc.reset( - new framework::proto::ProgramDesc(program)); - - if (!argument->main_dfg) { - argument->main_dfg.reset(new DataFlowGraph); - } - desc_ = argument->origin_program_desc.get(); - return true; -} - -bool FluidToDataFlowGraphPass::Finalize() { return true; } - -void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { - PADDLE_ENFORCE(graph); - PADDLE_ENFORCE(desc_); - graph->Build(*desc_); -} - -namespace { -class DFG_DebuggerPass : public DFG_GraphvizDrawPass { - public: - using Config = 
DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config &config) - : DFG_GraphvizDrawPass(config) {} - std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } - bool Finalize() override { return true; } -}; -} - -AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { - return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h deleted file mode 100644 index b9e262020e9522e167b998d57e2be2ac19b48447..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* - * This file implements the transformation from data flow graph to fluid - * ProgramDesc. - */ - -#pragma once - -#include - -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Transform a FluidDesc to a SSA. 
- */ -class FluidToDataFlowGraphPass final : public DataFlowGraphPass { - public: - FluidToDataFlowGraphPass() = default; - - bool Initialize(Argument *argument) override; - bool Finalize() override; - - void Run(DataFlowGraph *graph) override; - - std::string repr() const override { return "fluid-to-data-flow-graph"; } - std::string description() const override { - return "transform a fluid ProgramDesc to a data flow graph."; - } - - AnalysisPass *CreateGraphvizDebugerPass() const override; - - private: - framework::proto::ProgramDesc const *desc_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc deleted file mode 100644 index 267a0a84ebf75615e0b390f4a1b3bf3b51793fc7..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" - -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(FluidToDataFlowGraphPass, Test) { - FluidToDataFlowGraphPass pass; - Argument argument(FLAGS_inference_model_dir); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); - // Analysis is sensitive to ProgramDesc, careful to change the original model. - ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); - pass.Finalize(); - ASSERT_FALSE(argument.main_dfg->DotString().empty()); - EXPECT_FALSE(argument.main_dfg->inputs().empty()); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc deleted file mode 100644 index 9f52af670b8e26c31230bc003af4c9ddc5b67802..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/io.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void FluidToIrPass::EnableParamModify(const std::string &model_dir, - const std::string &prog_file, - const std::string ¶m_file) { - PADDLE_ENFORCE(argument_); - argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope); - // Load parameters. - VLOG(30) << "Loading parameters from " << model_dir; - LoadParams(&argument_->Get(framework::ir::kParamScopeAttr), - model_dir, prog_file, param_file); -} - -bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir, - const std::string &prog_file, - const std::string ¶m_file) { - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - framework::Executor executor(place); - PADDLE_ENFORCE(argument_->origin_program_desc.get()); - framework::ProgramDesc program(*argument_->origin_program_desc); - if ((!prog_file.empty()) && (!param_file.empty())) { - LOG(INFO) << "load single model file from " << prog_file; - Load(&executor, scope, prog_file, param_file); - } else if (!dir.empty()) { - LOG(INFO) << "load from dir " << dir; - Load(&executor, scope, dir); - } else { - LOG(ERROR) << "failed to load parameters"; - return false; - } - return true; -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h deleted file mode 100644 index c2599e218a2306f9353b843b7ea3f18aeacb008e..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/flags.h" -#include "paddle/fluid/inference/analysis/ir_pass_manager.h" - -namespace paddle { -namespace inference { -namespace analysis { - -static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; - -class FluidToIrPass final : public DataFlowGraphPass { - public: - FluidToIrPass() = default; - - bool Initialize(Argument *argument) override { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr), - "argument need the attr %s", kFluidToIrPassesAttr); - argument_ = argument; - if (argument->origin_program_desc) { - LOG(WARNING) << "argument's origin_program_desc is already set, might " - "duplicate called"; - } - // set fluid model program path - if (!argument->fluid_model_program_path) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); - argument->fluid_model_program_path.reset( - new std::string(*argument->fluid_model_dir + "/__model__")); - } - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); - // Load program. - auto program = LoadProgramDesc(*argument->fluid_model_program_path); - argument->origin_program_desc.reset( - new framework::proto::ProgramDesc(program)); - // Create main data flow graph. 
- if (!argument->main_dfg) { - argument->main_dfg.reset(new DataFlowGraph); - } - argument->Set("ir_program_desc", new ProgramDesc(program)); - - LOG(INFO) << "Loading parameters"; - // Load parameters to argument if needed. - if (argument->fluid_model_dir || (argument->fluid_model_program_path && - argument->fluid_model_param_path)) { -#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : ""; - SAFE_GET(fluid_model_dir); - SAFE_GET(fluid_model_program_path); - SAFE_GET(fluid_model_param_path); -#undef SAFE_GET - EnableParamModify(fluid_model_dir, fluid_model_program_path, - fluid_model_param_path); - } - - return true; - } - - bool Finalize() override { return true; } - - void Run(DataFlowGraph *graph) override { - // Call all the IR Passes - IRPassManager ir_passes(argument_->Get("ir_program_desc"), - nullptr); - // Pass the scope from analysis to IR if needed. - if (argument_->Has(framework::ir::kParamScopeAttr)) { - // Here the address is passed, attention that IR doesn't own the scope, so - // the real scope in analysis should live during the IR phase. - ir_passes.graph().Set( - framework::ir::kParamScopeAttr, - new framework::Scope *(&argument_->Get( - framework::ir::kParamScopeAttr))); - } - - if (FLAGS_IA_enable_ir) { - const auto &ir_passes_to_apply = - argument_->Get>(kFluidToIrPassesAttr); - ir_passes.Apply(ir_passes_to_apply); - } - - PADDLE_ENFORCE(argument_->main_dfg.get()); - argument_->main_dfg->Build(ir_passes.graph()); - // inherit the arguments from ir. - if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) { - argument_->Set( - framework::ir::kFuseStatisAttr, - new std::unordered_map( - ir_passes.graph().Get>( - framework::ir::kFuseStatisAttr))); - } - } - - void EnableParamModify(const std::string &model_dir, - const std::string &prog_file, - const std::string ¶m_file); - - std::string repr() const override { return "fluid-to-ir-pass"; } - - private: - // Load parameters from a single file or from a directory. 
- bool LoadParams(framework::Scope *scope, const std::string &dir, - const std::string &prog_file, const std::string ¶m_file); - - private: - Argument *argument_{nullptr}; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc deleted file mode 100644 index 2ea70a1d2060e03769d67060dc6f008207342b52..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/graph_traits.cc +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/graph_traits.h" diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h deleted file mode 100644 index aed2b1e8e27d94b430201d70ecf09d4acc33c8fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/graph_traits.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the GraphTraits template class that should be specified - * by classes that want to be iteratable by generic graph iterators. - * - * This file also defines the marker class Inverse that is used to iterate over - * graphs in a graph defined, inverse ordering... - */ - -#pragma once - -#include "paddle/fluid/inference/analysis/helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * This class should be specialized by different graph types... - * That's why the base class is empty. - */ -template -struct GraphTraits { - // using NodesBFSIterator = xxx - - // NodesBFSIterator nodes_begin(); - // NodesBFSIterator nodes_end(); -}; - -/* - * Inverse - This class is used as a marker class to tell the graph iterator to - * iterate in a graph defined Inverse order. - */ -template -struct Inverse { - const GraphType &graph; - - explicit Inverse(const GraphType &graph) : graph(graph) {} -}; - -/* - * Provide a partial specialization of GraphTraits so that the inverse of an - * inverse turns into the original graph. - */ -template -struct GraphTraits>> : GraphTraits {}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69ac199dea136535ba445e890596f6227..269a0da9f9378601373e42d741f519843b111ec6 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -101,20 +102,20 @@ class OrderedRegistry { public: T *Register(const std::string &name, T *x) { PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); - dic_[name] = data_.size(); - data_.emplace_back(std::unique_ptr(x)); - return data_.back().get(); + dic_[name] = elements_.size(); + elements_.emplace_back(std::unique_ptr(x)); + return elements_.back().get(); } T *Lookup(const std::string &name) { auto it = dic_.find(name); if (it == dic_.end()) return nullptr; - return data_[it->second].get(); + return elements_[it->second].get(); } protected: std::unordered_map dic_; - std::vector> data_; + std::vector> elements_; }; template @@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } -static void ExecShellCommand(const std::string &cmd, std::string *message) { - char buffer[128]; - std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); - if (!pipe) { - LOG(ERROR) << "error running command: " << cmd; - return; - } - while (!feof(pipe.get())) { - if (fgets(buffer, 128, pipe.get()) != nullptr) { - *message += buffer; - } - } -} - static framework::proto::ProgramDesc LoadProgramDesc( const std::string &model_path) { std::ifstream fin(model_path, std::ios::in | std::ios::binary); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e76708baf4b39afb0febbcf3ff71281dfbfc8627..fce5e1cac92064a320179243380ea02b2c5d7838 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -18,6 +18,8 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" +#include 
"paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -27,21 +29,33 @@ using string::PrettyLogEndl; using string::PrettyLog; using string::Style; -IRPassManager::IRPassManager(const ProgramDesc &program, - framework::Scope *scope) - : program_(program) { - graph_.reset(new framework::ir::Graph(program)); - if (scope) - graph_->Set(framework::ir::kParamScopeAttr, new framework::Scope *(scope)); +IRPassManager::IRPassManager(Argument *argument) { + ARGUMENT_CHECK_FIELD(argument, main_program); + graph_ = std::unique_ptr(new Graph(argument->main_program())); + if (argument->Has("scope")) { + graph_->Set(framework::ir::kParamScopeAttr, + new framework::Scope *( + const_cast(&argument->scope()))); + } + + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + CreatePasses(argument, argument->ir_analysis_passes()); } -void IRPassManager::Apply(const std::vector &passes) { - // Apply all the passes +void IRPassManager::CreatePasses(Argument *argument, + const std::vector &passes) { std::string pre_pass; int pass_num = 0; for (const std::string &pass_name : passes) { - PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + + // Set some pass attributes. + if (pass_name == "ir_analysis_pass") { + pass->Set("tensorrt_node_teller", + new SubgraphDetector::NodeInsideSubgraphTeller( + argument->tensorrt_node_teller())); + } + if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + @@ -49,11 +63,47 @@ void IRPassManager::Apply(const std::vector &passes) { pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; } - graph_ = pass->Apply(std::move(graph_)); + + if (pass_name == "tensorrt_subgraph_pass") { + PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); + pass->SetNotOwned("tensorrt_node_teller", + argument->tensorrt_node_teller_ptr()); + pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); + pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + } + + // graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; + + passes_.emplace_back(std::move(pass)); } } +std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { + if (passes_.empty()) { + return graph; + } + PADDLE_ENFORCE(graph.get()); + // Apply all the passes + for (const auto &pass : passes_) { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + graph = pass->Apply(std::move(graph)); + } + return std::move(graph); +} + +framework::proto::ProgramDesc IRPassManager::AcquireProgram( + std::unique_ptr *graph, const ProgramDesc &program) const { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + ProgramDesc desc(program); + pass->SetNotOwned("program", &desc); + auto *the_graph = graph->release(); + *graph = pass->Apply(std::unique_ptr(the_graph)); + return *desc.Proto(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6..983a582649706fa6eedb5aa459b5ac53b98f658b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -20,27 +20,38 @@ * for inference. 
*/ +#pragma once + +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/argument.h" namespace paddle { namespace inference { namespace analysis { using framework::ProgramDesc; +using framework::ir::Graph; class IRPassManager final { public: - IRPassManager(const ProgramDesc &program, framework::Scope *scope); + explicit IRPassManager(Argument *argument); + + std::unique_ptr Apply(std::unique_ptr graph); - void Apply(const std::vector &passes); + framework::proto::ProgramDesc AcquireProgram( + std::unique_ptr *graph, const ProgramDesc &program) const; framework::ir::Graph &graph() const { return *graph_; } private: - std::unique_ptr graph_; - ProgramDesc program_; + void CreatePasses(Argument *argument, const std::vector &passes); + + std::unique_ptr graph_; + std::vector> passes_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c71cff889ed7cdb95f79b9bc89a9ca5ab370271c --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) +set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + +set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc similarity index 53% rename from paddle/fluid/inference/analysis/subgraph_splitter.cc rename to paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index 
3688ea15d959309d33901c360cb1055e2ac489a5..b6a5dfd087c95d0ccb0f5adfa4f754cfa5a44f14 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -12,46 +12,110 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace inference { namespace analysis { -const char *SubGraphSplitter::kMarkerAttrName = - "_sub_graph_splitter_inside_sub_graph"; +using framework::ir::Node; + +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inputs) { + if (nodes.count(in)) return true; + } + return false; + }; + + for (auto &node : graph) { + for (auto *in : node->inputs) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->IsVar() && !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outputs) { + if (!nodes.count(out) && out->IsVar()) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + +// Filter the Intermediate results of the subgraph node. 
+void FilterRedundantOutputOfSubGraph(Graph *graph) { + std::vector op_nodes; + for (auto &node : TopologicalSort(*graph)) { + if (node.IsVar() || Agent(&node).deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->IsOp()) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inputs) { + follow_up_input_names.insert(in->Name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outputs) { + if (follow_up_input_names.count(out->Name())) { + filtered_subgraph_outlinks.push_back(out); + } else { + Agent(out).set_deleted(true); + } + } + // The filtered_subgraph_outlinks may be empty. + op_nodes[i]->outputs = filtered_subgraph_outlinks; + } +} -std::vector> SubGraphSplitter::operator()() { +std::vector> SubgraphDetector::operator()() { MarkNodesInsideSubGraph(); return ExtractSubGraphs(); } // Mark the output variables inside a subgraph with the func. 
-inline void MarkOutLinksInSubGraph(const Function *func) { - for (auto *var : func->outlinks) { - var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true; +inline void MarkOutLinksInSubGraph(const Node *func) { + for (auto *var : func->outputs) { + Agent(var).set_marked(true); } } -void SubGraphSplitter::MarkNodesInsideSubGraph() { - for (auto &node : GraphTraits(*graph_).nodes()) { +void SubgraphDetector::MarkNodesInsideSubGraph() { + for (auto &node : framework::ir::GraphTraits::DFS(*graph_)) { if (node_inside_subgraph_teller_(&node)) { - node.attr(kMarkerAttrName).Bool() = true; - if (node.type() == Node::Type::kFunction) { + Agent(&node).set_marked(true); + if (node.IsOp()) { // If a function is inside the sub-graph, mark all the output variables // to be inside too, so that two marked functions will be inside a same // sub-graph, lets take a example: A_function->var->B_function, if // A_function is marked, var should also be marked, so that B_function // will be in the same sub-graph with A_function if B_function is // marked. - MarkOutLinksInSubGraph(static_cast(&node)); + MarkOutLinksInSubGraph(&node); } } } } -const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_"; - // Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node // a's output is node b, that is a and b is in the same sub-graph. The UF // algorithm will group them to the same cluster. @@ -60,8 +124,8 @@ using node_map_t = std::unordered_map; int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { int tmp = id; do { - tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32(); - } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp); + tmp = Agent(node_map.at(tmp)).union_find_parent(); + } while (Agent(node_map.at(tmp)).union_find_parent() != tmp); return tmp; } // Make this two node share the same ancestor. 
@@ -69,9 +133,9 @@ int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { int a_ancestor = UnionFindGetAncestor(node_map, a); int b_ancestor = UnionFindGetAncestor(node_map, b); - node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor; - node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor; - node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; + Agent(node_map.at(b_ancestor)).set_union_find_parent(a_ancestor); + Agent(node_map.at(a)).set_union_find_parent(a_ancestor); + Agent(node_map.at(b)).set_union_find_parent(a_ancestor); } // This is a simple representation of a graph. @@ -195,16 +259,21 @@ void FlexibleDFS(const std::vector &source, bool reverse, } } -std::vector> SubGraphSplitter::ExtractSubGraphs() { +std::vector> SubgraphDetector::ExtractSubGraphs() { // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; // We use brief_node_map to represent the original graph in order to avoid // changing the original graph. 
std::unordered_map brief_node_map; - for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + std::unordered_set valid_node_ids; + for (auto *node : graph_->Nodes()) { + valid_node_ids.insert(node->id()); + } + + for (auto &node : framework::ir::GraphTraits::TS(*graph_)) { brief_node_map[node.id()] = new BriefNode(&node); - if (node.attr(kMarkerAttrName).Bool()) { + if (Agent(&node).marked()) { marked_nodes.push_back(&node); } } @@ -213,26 +282,34 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { // n's parent == n.id means it is the ancestor - n->attr(kUnionFindParent).Int32() = n->id(); + Agent(n).set_union_find_parent(n->id()); node_map[n->id()] = n; } // create breif node map for (auto &itr : brief_node_map) { - for (Node *node : itr.second->node->inlinks) { - itr.second->inlinks.push_back(brief_node_map[node->id()]); + for (Node *node : itr.second->node->inputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->inlinks.push_back(brief_node_map.at(node->id())); } - for (Node *node : itr.second->node->outlinks) { - itr.second->outlinks.push_back(brief_node_map[node->id()]); + for (Node *node : itr.second->node->outputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->outlinks.push_back(brief_node_map.at(node->id())); } } for (auto &itr : brief_node_map) { BriefNode *brief_node = itr.second; - if (!brief_node->node->attr(kMarkerAttrName).Bool()) { - VLOG(40) << brief_node->node->id() << " node not a trt candicate."; + if (!Agent(brief_node->node).marked()) { + VLOG(4) << brief_node->node->id() << " node not a trt candidate."; continue; } @@ -254,7 +331,7 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { std::unordered_set contract_nodes; for (auto *out : brief_node->outlinks) { // must be an trt candidate - if 
(!out->node->attr(kMarkerAttrName).Bool()) continue; + if (!Agent(out->node).marked()) continue; // get all dst input nodes except src. std::vector source_nodes; for (auto *n : out->inlinks) { @@ -289,9 +366,8 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { std::unordered_map> clusters; for (auto *n : marked_nodes) { - if (n->type() == Node::Type::kFunction) { - clusters[UnionFindGetAncestor(node_map, - n->attr(kUnionFindParent).Int32())] + if (n->IsOp()) { + clusters[UnionFindGetAncestor(node_map, Agent(n).union_find_parent())] .push_back(n); } } @@ -304,28 +380,59 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { return result; } -void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } +void SubGraphFuser::operator()() { ReplaceNodesWithSubGraphs(); } + +void RemoveIntermediateOutputInSubgraph(const std::vector &subgraph, + Graph *graph, + std::vector *outputs) { + std::unordered_set subgraph_set(subgraph.begin(), subgraph.end()); + std::unordered_set valid_output; + + for (auto *output : *outputs) { + int num_used = 0; + for (auto *node : output->outputs) { + if (!subgraph_set.count(node)) ++num_used; + if (num_used > 0) valid_output.insert(output); + } + } + + outputs->assign(valid_output.begin(), valid_output.end()); +} + +void DetachDeletedNodes(framework::ir::Graph *graph) { + std::unordered_set nodes; + for (auto *node : graph->Nodes()) { + if (Agent(node).deleted()) { + node->inputs.clear(); + node->outputs.clear(); + } + } +} -void SubGraphFuse::ReplaceNodesWithSubGraphs() { - auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); +void SubGraphFuser::ReplaceNodesWithSubGraphs() { + auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { - if (subgraph.size() <= argument_->Get("minimum_subgraph_size")) - continue; + if (subgraph.size() <= (size_t)min_subgraph_size_) continue; + LOG(INFO) << "detect a subgraph size " << subgraph.size(); std::unordered_set 
subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. - auto *block_node = static_cast( - graph_->nodes.Create(Node::Type::kFunctionBlock)); + framework::OpDesc empty_desc; + empty_desc.SetType("tensorrt_engine"); + auto *block_node = graph_->CreateOpNode(&empty_desc); + Agent(block_node).set_subgraph({}); auto io = ExtractInputAndOutputOfSubGraph(subgraph); - block_node->inlinks = std::move(io.first); - block_node->outlinks = std::move(io.second); + block_node->inputs = std::move(io.first); + block_node->outputs = std::move(io.second); + + RemoveIntermediateOutputInSubgraph(subgraph, graph_, &block_node->outputs); for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. - node->SetDeleted(); - block_node->subgraph.push_back(node); + Agent(node).set_deleted(true); + Agent(block_node).subgraph()->push_back(node); } // Change all the sub-graph's inputs and outputs corresponding inlink and @@ -339,16 +446,92 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { std::unordered_set uniq(nodes.begin(), nodes.end()); nodes.assign(uniq.begin(), uniq.end()); }; - for (auto *i : block_node->inlinks) { - inlink_or_outlink_cleaner(i->outlinks); + for (auto *i : block_node->inputs) { + inlink_or_outlink_cleaner(i->outputs); } - for (auto *&o : block_node->outlinks) { - inlink_or_outlink_cleaner(o->inlinks); + for (auto *&o : block_node->outputs) { + inlink_or_outlink_cleaner(o->inputs); } } + // DetachDeletedNodes(graph_); FilterRedundantOutputOfSubGraph(graph_); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +NodesTSIterator::NodesTSIterator(const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not 
be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + if (Agent(p).deleted()) { + visited.insert(p); + to_visit.erase(p); + } + + inlink_visited.clear(); + + std::copy_if(p->inputs.begin(), p->inputs.end(), + std::back_inserter(inlink_visited), + [&](Node *x) -> bool { return visited.count(x) != 0; }); + + if (inlink_visited.size() == p->inputs.size()) { + sorted_.push_back(p); + for (auto *_ : p->outputs) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +NodesTSIterator &NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool NodesTSIterator::operator==(const NodesTSIterator &other) { + return sorted_ == other.sorted_ && cursor_ == other.cursor_; +} + +Node *NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..ea88edd042aa9d46f66af1aa92f2cb273696c118 --- /dev/null +++ 
b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h @@ -0,0 +1,182 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; + +const char kIsFunctionNode[] = "__is_function_node__"; +const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; +const char kSubgraphSplitterMarkerAttrName[] = + "_sub_graph_splitter_inside_sub_graph"; + +/* + * Detect the nodes in a sub-graph that meet some conditions. This class doesn't + * modify the graph. + */ +class SubgraphDetector { + public: + // Tell whether a node is inside a sub-graph. + using NodeInsideSubgraphTeller = + std::function; + + SubgraphDetector(Graph *graph, const NodeInsideSubgraphTeller &teller) + : graph_(graph), node_inside_subgraph_teller_(teller) {} + + std::vector> operator()(); + + protected: + // Mark the nodes inside the accepted sub-graph using + // node_inside_subgraph_teller. + void MarkNodesInsideSubGraph(); + + // Merge the marked nodes into sub-graphs and return the sub-graphs. 
+ std::vector> ExtractSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +/* + * SubGraphFuser - Replace some nodes with the sub-graph node they are inside. + * To some extent, the TensorRT engine is just a fusion op for a model. + */ +class SubGraphFuser { + public: + using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller; + + SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller, + int min_subgraph_size) + : graph_(graph), + node_inside_subgraph_teller_(teller), + min_subgraph_size_{min_subgraph_size} {} + + // The main method which run all the logic. + void operator()(); + + protected: + // Remove the nodes inside sub-graphs and replace with the SubGraphNode. + void ReplaceNodesWithSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; + int min_subgraph_size_; +}; + +struct NodeWrapper { + bool deleted{false}; + bool marked{false}; + int union_find_parent{-1}; + std::vector subgraph; +}; + +/* + * ir::Node agent for subgraph detector. + */ +struct Agent { + explicit Agent(framework::ir::Node *x) : x_(x) {} + + NodeWrapper &wrapper() { + if (!x_->IsWrappedBy()) { + x_->WrappedBy(new NodeWrapper); + } + return x_->template Wrapper(); + } + + bool deleted() { return wrapper().deleted; } + void set_deleted(bool x) { wrapper().deleted = x; } + + bool marked() { return wrapper().marked; } + void set_marked(bool x) { wrapper().marked = x; } + + void set_subgraph(const std::vector &x) { + wrapper().subgraph = x; + } + + int union_find_parent() { return wrapper().union_find_parent; } + void set_union_find_parent(int v) { wrapper().union_find_parent = v; } + + std::vector *subgraph() { return &wrapper().subgraph; } + std::vector &inputs() { return x_->inputs; } + std::vector &outputs() { return x_->outputs; } + + private: + framework::ir::Node *x_; +}; + +// Topological sorting iterator on nodes. 
+struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + explicit NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + framework::ir::Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + framework::ir::Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; +}; + +// The nodes those have no input will be treated as start points. +static std::vector ExtractStartPoints(const Graph &g) { + std::vector result; + for (auto *node : g.Nodes()) { + if (node->inputs.empty()) { + result.push_back(node); + } + } + return result; +} + +static iterator_range TopologicalSort(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..21fd8d2df49698d7fa38d906f7921f092ca916a3 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( + + std::unique_ptr graph) const { + framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); + + auto teller = + Get("tensorrt_node_teller"); + + SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + fuser(); + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !Agent(node).subgraph()->empty()) { + CreateTensorRTOp(node, graph.get()); + + std::unordered_set nodes2remove( + Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + + return graph; +} + +void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, + Graph *graph) const { + auto 
*op_desc = node->Op(); + static int counter{0}; + auto &subgraph = *Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + // An fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + } + + // collect inputs + std::unordered_set input_names; + std::unordered_set input_names_with_id; + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + std::unordered_set output_names; + std::unordered_set output_names_with_id; + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetType("tensorrt_engine"); + + std::unordered_map output_name_map; + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + // Why we do this? + // During the transition from fluid OP to tensorrt OP, we map + // the input and output Tensor(fluid data structure) of fluid OP + // to the corresponding ITensor (trt data structure) through the + // Tensor name. When we set up ITensor for an variable, we must + // ensure that it has not been set before. + // If there is variable in the fluid graph, which is not only the + // input of a OP, but also the output of a Op, there will be problems. + // So we have to rename the variable in the subgraph to make sure + // it is either an OP's input or an OP's output. 
+ + auto &subgraph_nodes = *Agent(node).subgraph(); + for (size_t index = 0; index < block_desc.OpSize(); index++) { + framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } + + // When tensorrt engine runs at the end of the operation, + // output_mapping 
help us copy the data from the renamed ITensor + // to Tensor. + std::vector output_mapping; + for (auto name : output_names) { + // LOG(INFO) << name << " " << output_name_map.size(); + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + *block_desc.Proto()->mutable_vars() = + const_cast(&graph->program()) + ->Proto() + ->blocks(0) + .vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + PADDLE_ENFORCE(!output_mapping.empty()); + // Set attrs + SetAttr(op_desc->Proto(), "subgraph", + block_desc.Proto()->SerializeAsString()); + SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); + SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); + SetAttr(op_desc->Proto(), "engine_uniq_key", + "trt-" + std::to_string(counter++)); + SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); + SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); +} + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(tensorrt_subgraph_pass, + paddle::inference::analysis::TensorRtSubgraphPass) + .RequirePassAttr("tensorrt_node_teller") + .RequirePassAttr("max_batch_size") + .RequirePassAttr("workspace_size"); diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h similarity index 55% rename from paddle/fluid/inference/analysis/model_store_pass_tester.cc rename to paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index d6493fc25edf25003504542f1b01c4105754c8df..502353b95fc15e763900a0caf1649257508f0880 100644 --- 
a/paddle/fluid/inference/analysis/model_store_pass_tester.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -12,31 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/model_store_pass.h" - -#include -#include -#include "paddle/fluid/inference/analysis/analyzer.h" +#pragma once +#include +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace inference { namespace analysis { -DEFINE_string(inference_model_dir, "", "Model path"); - -TEST(DFG_StorePass, test) { - Analyzer analyzer; - Argument argument(FLAGS_inference_model_dir); - argument.model_output_store_path.reset( - new std::string("./_dfg_store_pass_tmp")); - // disable storage in alalyzer - FLAGS_IA_output_storage_path = ""; - analyzer.Run(&argument); +class TensorRtSubgraphPass : public framework::ir::FusePassBase { + public: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; - ModelStorePass pass; - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); -} + private: + void CreateTensorRTOp(framework::ir::Node *x, + framework::ir::Graph *graph) const; + void CleanIntermediateOutputs(framework::ir::Node *node); +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc deleted file mode 100644 index 4f40a7a1adc324b824af8e3831901abcbffaeca6..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/model_store_pass.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/model_store_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void ModelStorePass::Run(DataFlowGraph *x) { - if (!argument_->fluid_model_param_path) { - PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); - argument_->fluid_model_param_path.reset( - new std::string(*argument_->fluid_model_dir + "param")); - } - PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); - // Directly copy param file to destination. - std::stringstream ss; - // NOTE these commands only works on linux. 
- ss << "mkdir -p " << *argument_->model_output_store_path; - VLOG(30) << "run command: " << ss.str(); - PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); - ss.str(""); - - ss << "cp " << *argument_->fluid_model_dir << "/*" - << " " << *argument_->model_output_store_path; - VLOG(30) << "run command: " << ss.str(); - PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); - - // Store program - PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, - "program desc is not transformed, should call " - "DataFlowGraphToFluidPass first."); - VLOG(30) << "store analyzed program to " - << *argument_->model_output_store_path; - const std::string program_output_path = - *argument_->model_output_store_path + "/__model__"; - std::ofstream file(program_output_path, std::ios::binary); - PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", - program_output_path); - const std::string serialized_message = - argument_->transformed_program_desc->SerializeAsString(); - file.write(serialized_message.c_str(), serialized_message.size()); -} - -bool ModelStorePass::Finalize() { return true; } - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc deleted file mode 100644 index 3339b5044df0cf91d00aa9ddad310d4bf263bc3c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/node.h" -#include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace analysis { - -std::vector Value::dot_attrs() const { - return std::vector({Dot::Attr("style", "filled,rounded"), - Dot::Attr("shape", "box"), - Dot::Attr("fillcolor", "red")}); -} - -std::vector Function::dot_attrs() const { - return std::vector({Dot::Attr("style", "filled,rounded"), - Dot::Attr("shape", "diamond"), - Dot::Attr("fillcolor", "yellow")}); -} - -Node *NodeMap::Create(Node::Type type) { - switch (type) { - case Node::Type::kFunction: - nodes_.emplace_back(new Function); - break; - case Node::Type::kValue: - nodes_.emplace_back(new Value); - break; - case Node::Type::kFunctionBlock: - nodes_.emplace_back(new FunctionBlock); - break; - default: - PADDLE_THROW("Not supported node type."); - } - nodes_.back()->id_ = size() - 1; - return nodes_.back().get(); -} - -Node *NodeMap::GetMutable(size_t id) { - PADDLE_ENFORCE_GT(size(), id); - return nodes_[id].get(); -} - -const Node &NodeMap::Get(size_t id) const { - PADDLE_ENFORCE_GT(size(), id); - return *nodes_[id].get(); -} - -void NodeMap::Delete(size_t id) { - PADDLE_ENFORCE_LT(id, size()); - nodes_[id]->SetDeleted(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h deleted file mode 100644 index af34156bc2f101465d87cb10e2155745022eb521..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the Node class and its subclasses. A Node is the basis - * analysis element in a computation graph. - * There are basically two kinds of nodes, the function node and value node. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/inference/analysis/device.h" -#include "paddle/fluid/inference/analysis/dot.h" -#include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/variant.h" - -namespace paddle { -namespace inference { -namespace analysis { - -class NodeMap; - -// A helper class to maintain the status from Pass. -struct AnyAttr { - using any_t = - boost::variant; - // NOTE T should be a primary type or a struct combined by several primary - // types. - // NOTE the STL containers should not use here. - // Some usages - // Attr attr; - // attr.Bool() = true; - bool &Bool() { return As(); } - float &Float() { return As(); } - int32_t &Int32() { return As(); } - int64_t &Int64() { return As(); } - void *&Pointer() { return As(); } - std::string &String() { return As(); } - - template - T &As() { - if (type_index_ == typeid(AnyAttr)) { - type_index_ = typeid(T); - any_data_ = T(); - } else { - PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type"); - } - return boost::get(any_data_); - } - - private: - any_t any_data_; - std::type_index type_index_{typeid(AnyAttr)}; -}; - -/* - * Node Representation. - * - * This is a very important class for analysis. 
It is the base class of all - * nodes computed by a program that may be used as operands to other nodes. - * Node is the super class of other important classes such as Function and - * Value, some nodes can have a name. - */ -class Node { - public: - // Node type. NOTE the new node types should add here. - enum class Type { kNone = -1, kFunction, kValue, kFunctionBlock }; - - Node() = default; - - // Cast to a subclass type, Function for example. - template - Subclass &As() { - return *dynamic_cast(this); - } - - // Formatted representation of this Node. - virtual std::string repr() const { - return name() + "(" + std::to_string(id()) + ")"; - } - - // DOT node representation. One Node type can customize its own node - // representation. - virtual std::vector dot_attrs() const { - return std::vector({Dot::Attr("style", "filled")}); - } - - // Get an additional attribute and convert it to T data type. NOTE this will - // silently create a new attribute if not exists. - AnyAttr &attr(const std::string &name) const { return attrs_[name]; } - - int id() const { return id_; } - - // The Protobuf description is set/get with a void* to decouple Node interface - // from a specific kind of Protobuf message. - void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } - void *pb_desc() const { return attr("pb_desc").Pointer(); } - - void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; } - const std::string &pb_msg() const { return attr("pb_msg").String(); } - - void SetDeleted() { deleted_ = true; } - bool deleted() const { return deleted_; } - - void SetName(const std::string &name) { name_ = name; } - const std::string &name() const { return name_; } - - void SetType(Type type) { type_ = type; } - Type type() const { return type_; } - - // Input links. - std::vector inlinks; - // Output links. - std::vector outlinks; - - // Type checks. 
- bool IsFunction() const { return type_ == Node::Type::kFunction; } - bool IsValue() const { return type_ == Node::Type::kValue; } - bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; } - - virtual ~Node() {} - - friend class NodeMap; - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Node); - - protected: - // The id number not the name is a node's unique identifier in the computation - // graph. - int id_{-1}; - std::string name_; - Type type_{Type::kNone}; - // Mark this node is deleted by some pass. - bool deleted_{false}; - mutable std::unordered_map attrs_; -}; - -class Function; -/* - * Value represents a value node, it has some attributes including dims, data - * type and so on. - */ -class Value : public Node { - public: - enum class DataType { kInt32, kInt64, kFloat32, kFloat64 }; - using Dims = std::vector; - - void SetDataType(DataType data_type) { data_type_ = data_type; } - DataType data_type() const { return data_type_; } - - void SetDims(const Dims &dims) { dims_ = dims; } - const Dims &dims() const { return dims_; } - - Device device() const { return device_; } - void SetDevice(Device device) { device_ = device; } - - std::vector dot_attrs() const override; - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Value); - - protected: - Value() { SetType(Node::Type::kValue); } - friend class NodeMap; - - private: - DataType data_type_; - Dims dims_; - Device device_; -}; - -/* - * Function represents any kind of executable concepts that takes several Values - * as input, and outputs several Values. - */ -class Function : public Node { - public: - std::vector dot_attrs() const override; - - // Get the operator's type from Desc. - const std::string &func_type() const { return func_type_; } - // Set the operator's type. 
- void SetFuncType(const std::string &func_type) { func_type_ = func_type; } - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Function); - - protected: - std::string func_type_; - Function() { SetType(Node::Type::kFunction); } - friend class NodeMap; -}; - -/* - * FunctionBlock is a Node that contains a sub-graph multiple Node. - */ -struct FunctionBlock : public Node { - std::string repr() const override { return "block-" + std::to_string(id()); } - std::vector subgraph; - - protected: - FunctionBlock() { SetType(Node::Type::kFunctionBlock); } - friend class NodeMap; -}; - -class NodeMap { - public: - // Create a new node with type. - Node *Create(Node::Type type); - - // Get a node by its id. - Node *GetMutable(size_t id); - - const Node &Get(size_t id) const; - - void Delete(size_t id); - - const std::vector> &nodes() const { return nodes_; } - - size_t size() const { return nodes_.size(); } - - private: - std::vector> nodes_; - std::unordered_map map_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc deleted file mode 100644 index 9207c15373fb4264ff0e738e93ae88e1c08b554c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/node.h" - -#include - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(NodeAttr, bool) { - AnyAttr x; - x.Bool() = true; - ASSERT_EQ(x.Bool(), true); -} - -TEST(NodeAttr, int32) { - AnyAttr x; - x.Int32() = 32; - ASSERT_EQ(x.Int32(), 32); -} - -TEST(NodeAttr, string) { - AnyAttr x; - x.String() = "Hello"; - ASSERT_EQ(x.String(), "Hello"); -} - -TEST(Node, Attr) { - // Node is an abstract class, use Value instead for they share the same Attr - // logic. - NodeMap nodes; - auto* node = nodes.Create(Node::Type::kValue); - node->attr("v0").Int32() = 2008; - ASSERT_EQ(node->attr("v0").Int32(), 2008); - - node->attr("str").String() = "hello world"; - ASSERT_EQ(node->attr("str").String(), "hello world"); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc deleted file mode 100644 index ce390ee8313d6e3e2f0d79fb59d2225e2779180b..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/string/pretty_log.h" - -namespace paddle { -namespace inference { -namespace analysis { - -bool PassManager::Initialize(Argument* argument) { - argument_ = argument; - for (auto& pass : data_) { - VLOG(30) << "Initializing pass [" << pass->repr() << "]"; - if (!pass->Initialize(argument)) { - LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; - return false; - } - } - return true; -} - -void DfgPassManager::RunAll() { - PADDLE_ENFORCE(argument_); - VLOG(30) << "Total " << data_.size() << " Analysys passes"; - for (auto& pass : data_) { - string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", - pass->repr()); - pass->Run(argument_->main_dfg.get()); - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h deleted file mode 100644 index 412747c4fcce73303703f586f7a04edf4cc5ee76..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the logic of pass management. 
The analysis for inference is - * a pipeline of Passes, a PassManager is a agency that helps to manage the - * executation of the Passes. - * - * There are two modes of Passes, the first one is called NodePass and takes - * an Node as input and output; the second one is called DFGPass and takes a - * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in - * the same pipeline, there are two kinds of PassManagers, both takes a DFG as - * input and output a DFG, but the Passes inside are different: - * - * 1. NodePassManager: the passes inside are all NodePasses, it can have - * different graph trivial algorithm, for example, DFS_NodePassManager will - * trigger the passes in depth first order; - * 2. DfgPassManager: the passes inside are all DfgPasses. - */ - -#pragma once - -#include -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * PassManager is the base class for all pass managers, a pass manager has - * several Pass-es registered, and execute them in the linear order. - */ -class PassManager : public OrderedRegistry { - public: - PassManager() = default; - // Call all the passes' Initialize methods. The desc and data_flow_graph are - // globally shared, so pass them as the arguemnts for all the pass managers. - virtual bool Initialize(const Argument& argument) { return false; } - - virtual bool Initialize(Argument* argument); - - // Call all the passes' Finalize methods. - virtual bool Finalize() { - for (auto& pass : data_) { - if (!pass->Finalize()) { - LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]"; - return false; - } - } - return true; - } - - // Run all the passes. - virtual void RunAll() = 0; - - // Short identifier. - virtual std::string repr() const = 0; - // Long description. 
- virtual std::string description() const = 0; - - virtual ~PassManager() = default; - - protected: - Argument* argument_{nullptr}; -}; - -/* - * A pass manager that process a DFG. - */ -class DfgPassManager : public PassManager { - public: - DfgPassManager() = default; - - void RunAll() override; - - virtual ~DfgPassManager() = default; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc deleted file mode 100644 index 72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -class TestDfgPassManager final : public DfgPassManager { - public: - TestDfgPassManager() = default; - virtual ~TestDfgPassManager() = default; - // Short identifier. 
- std::string repr() const override { return "test-pass-manager"; } - // Long description. - std::string description() const override { return "test doc"; } -}; - -TEST(PassManager, DFG_pass_manager) { - TestDfgPassManager manager; - DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); - - manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass); - manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); - manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); - - Argument argument(FLAGS_inference_model_dir); - - ASSERT_TRUE(&argument); - ASSERT_TRUE(manager.Initialize(&argument)); - manager.RunAll(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30c27b1183a75de8c0bb50ef3617d747b239fae --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -0,0 +1,9 @@ +cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) + +set(analysis_deps ${analysis_deps} + ir_graph_build_pass + ir_analysis_pass + analysis_passes + CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..233bfd6a42b7f123813d4ef5cecf353f7e88d208 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrAnalysisComposePass::RunImpl(Argument *argument) { + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { + InitTensorRTAttrs(argument); + } + ApplyIrPasses(argument); + CollectFusionStatis(argument); +} + +std::string IrAnalysisComposePass::repr() const { + return "ir-analysis-compose-pass"; +} + +void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { + if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { + LOG(INFO) << "Initing TensorRT pass"; + argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { + std::unordered_set teller_set( + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "dropout", "split"}); + if (!node->IsOp()) return false; + + if (teller_set.count(node->Op()->Type())) { + return true; + } else { + return false; + } + }); + } +} + +void 
IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { + std::vector passes({ + "ir_graph_build_pass", "ir_analysis_pass", + }); + for (const auto &pass : passes) { + VLOG(2) << "Run pass " << pass; + auto *the_pass = PassRegistry::Global().Retreive(pass); + the_pass->Run(argument); + } +} + +void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h similarity index 53% rename from paddle/fluid/inference/analysis/model_store_pass.h rename to paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3..53e2ebb0038a5c105f68a0146b3da90a6ae34af8 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -12,42 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* - * This file defines ModelStorePass, which store the runtime DFG to a Paddle - * model in the disk, and that model can be reloaded for prediction. - */ - #pragma once + #include +#include #include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -class ModelStorePass : public DataFlowGraphPass { +/* + * The analysis pass to run a list of IR passes (like a function call). + * Currently, it should be the first pass of analysis phase. 
+ */ +class IrAnalysisComposePass : public AnalysisPass { public: - bool Initialize(Argument* argument) override { - if (!argument) { - LOG(ERROR) << "invalid argument"; - return false; - } - argument_ = argument; - return true; - } + void RunImpl(Argument* argument) override; + std::string repr() const override; - void Run(DataFlowGraph* x) override; + private: + void InitTensorRTAttrs(Argument* argument); - std::string repr() const override { return "DFG-store-pass"; } - std::string description() const override { - return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle - model in the disk, and that model can be reloaded for prediction again.)DD"; - } + void ApplyIrPasses(Argument* argument); - bool Finalize() override; + void CollectFusionStatis(Argument* argument); - private: - Argument* argument_{nullptr}; + // Assign a Scope for IR passes to modify the weights. + void AssignScopeToModify(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e327bd39f0ae0b8fbe3b189e4bb26a23c44d910c --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrAnalysisPass::RunImpl(Argument* argument) { + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + ARGUMENT_CHECK_FIELD(argument, main_program); + ARGUMENT_CHECK_FIELD(argument, scope); + + auto* the_graph = argument->ReleaseMainGraph(); + auto graph = std::unique_ptr(the_graph); + + // Apply passes. + IRPassManager the_ir_manager(argument); + graph = the_ir_manager.Apply(std::move(graph)); + PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); + argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc( + the_ir_manager.AcquireProgram(&graph, argument->main_program()))); + argument->SetMainGraph(graph.release()); +} + +std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h similarity index 70% rename from paddle/fluid/inference/analysis/node_attr_flags.h rename to paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index a3f70e5419a66969e8fb20152a8a8ace39316f57..d8a7449807585257c153d3c8958555ea2306afa3 100644 --- a/paddle/fluid/inference/analysis/node_attr_flags.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -12,20 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* - * This file contains all the flags that declared in Node::Attr. - * - * The Node::Attr is designed to share information between different passes, one - * can get other's attributes in a Node by the flags in this file. 
- */ #pragma once + +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + namespace paddle { namespace inference { namespace analysis { -#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__; - -DECLARE_NODE_ATTR(supported_by_tensorrt) // bool +/* + * Perform IR analysis passes. + * + * It is used to fuse some operators and apply other IR-level graph optimizations. + */ +class IrAnalysisPass : public AnalysisPass { + public: + void RunImpl(Argument* argument) override; + std::string repr() const override; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a30fef08b5726c965637e2fb489bdb2036bd2a8d --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +extern void ReadBinaryFile(const std::string &filename, std::string *contents); + +namespace analysis { + +void IrGraphBuildPass::RunImpl(Argument *argument) { + if (!argument->scope_valid()) { + argument->SetScope(new framework::Scope); + } + + if (argument->model_dir_valid()) { + auto program = LoadModel(argument->model_dir(), argument->scope_ptr()); + argument->SetMainProgram(program.release()); + } else if (argument->model_program_path_valid() && + argument->model_params_path_valid()) { + auto program = + LoadModel(argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr()); + argument->SetMainProgram(program.release()); + } else { + PADDLE_THROW( + "either model_dir or (program path and parameter path) should be set."); + } + + auto graph = std::unique_ptr(new Graph(argument->main_program())); + argument->SetMainGraph(graph.release()); + argument->main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope *(argument->scope_ptr())); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &path, framework::Scope *scope) { + platform::CPUPlace place; + framework::Executor exe(place); + return Load(&exe, scope, path); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope) { + platform::CPUPlace place; + framework::Executor exe(place); + return Load(&exe, scope, program_path, params_path); +} + +std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h 
b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3291e4f6ad3ca3079e672350805cab1f1e7b2413 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Load program and parameter to memory from the disk. + */ +class IrGraphBuildPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + + std::string repr() const override; + + private: + std::unique_ptr LoadModel(const std::string &path, + framework::Scope *scope); + std::unique_ptr LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope); + + std::string model_binary_str_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ef515f45f2483df8d1238b4758d6729d0299ce9 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { +PassRegistry::PassRegistry() { + passes_.emplace("ir_analysis_pass", + std::unique_ptr(new IrAnalysisPass)); + passes_.emplace("ir_graph_build_pass", + std::unique_ptr(new IrGraphBuildPass)); + passes_.emplace("ir_analysis_compose_pass", + std::unique_ptr(new IrAnalysisComposePass)); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/passes/passes.h similarity index 61% rename from paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc rename to paddle/fluid/inference/analysis/passes/passes.h index 367c25805d05f8d10fb8341158760ac6356a5c48..ea07e0dcbd992c9d10c6662909798ef79a01e3a7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/passes/passes.h @@ -12,24 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#pragma once -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { namespace inference { namespace analysis { -TEST(FluidToIrPass, Test) { - FluidToIrPass pass; - Argument argument(FLAGS_inference_model_dir); - argument.Set(kFluidToIrPassesAttr, - new std::vector({"infer_clean_graph_pass"})); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); -} +struct PassRegistry { + PassRegistry(); + + AnalysisPass* Retreive(const std::string& pass_type) { + return passes_[pass_type].get(); + } + + static PassRegistry& Global() { + static auto* x = new PassRegistry; + return *x; + } + + private: + std::unordered_map> passes_; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h deleted file mode 100644 index 76e4fda0249e03c617d1b37c079dcd97f21387c1..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the the class to partition a graph. 
- */ - -#pragma once - -#include - -#include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/node.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Detect the nodes in a sub-graph that meet some conditions. This class doesn't - * modify the graph. - */ -class SubGraphSplitter { - public: - static const char *kMarkerAttrName; - // Tell whether a node is inside a sub-graph. - using NodeInsideSubgraphTeller = std::function; - - SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} - - std::vector> operator()(); - - protected: - // Mark the nodes inside the accepted sub-graph using - // node_inside_subgraph_teller. - void MarkNodesInsideSubGraph(); - - // Merge the marked nodes into sub-graphs and return the sub-graphs. - std::vector> ExtractSubGraphs(); - - private: - DataFlowGraph *graph_; - NodeInsideSubgraphTeller node_inside_subgraph_teller_; -}; - -/* - * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To - * some extent, the TensorRT engine is just a fusion op for a model. - */ -class SubGraphFuse { - public: - using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, - Argument *argument) - : graph_(graph), - node_inside_subgraph_teller_(teller), - argument_(argument) {} - - // The main method which run all the logic. - void operator()(); - - protected: - // Remove the nodes inside sub-graphs and replace with the SubGraphNode. 
- void ReplaceNodesWithSubGraphs(); - - private: - DataFlowGraph *graph_; - NodeInsideSubgraphTeller node_inside_subgraph_teller_; - Argument *argument_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc deleted file mode 100644 index e1dc89fab5fb76d456b07c316ab1cabe6de23b26..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } - return false; -}; - -TEST(SubGraphSplitter, Split) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - LOG(INFO) << "spliter\n" << dfg.DotString(); - - ASSERT_GT(dfg.nodes.size(), 5UL); - - auto subgraphs = SubGraphSplitter(&dfg, teller)(); - - // Check the number of the marked nodes. - int marked_nodes = 0; - for (auto& node : dfg.nodes.nodes()) { - if (node->IsFunction() && - node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) { - ++marked_nodes; - } - } - EXPECT_EQ(marked_nodes, 6); - - // For human debug. - for (auto& subgraph : subgraphs) { - LOG(INFO) << "subgraph size " << subgraph.size(); - for (auto* node : subgraph) { - LOG(INFO) << "node " << node->repr(); - } - } - - ASSERT_EQ(subgraphs.size(), 1UL); - // The last sub-graph has 5 Functions. 
- ASSERT_EQ(subgraphs.back().size(), 6UL); -} - -TEST(SubGraphSplitter, Fuse) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - Argument argument; - argument.Set("minimum_subgraph_size", new int(3)); - - size_t count0 = dfg.nodes.size(); - - SubGraphFuse fuse(&dfg, teller, &argument); - fuse(); - - int count1 = 0; - for (auto& node : dfg.nodes.nodes()) { - if (node->deleted()) { - LOG(INFO) << "deleted " << node->repr(); - } - count1 += node->deleted(); - } - - // At least one nodes should be deleted. - ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(11, count1); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc deleted file mode 100644 index 174c8513f92cf869419f04cab5a54af65e9673b8..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/node_attr_flags.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { - for (auto &node : graph->nodes.nodes()) { - node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get()); - } -} - -class DfgDebuggerPass : public DFG_GraphvizDrawPass { - public: - explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) - : DFG_GraphvizDrawPass(config) {} - - std::string repr() const override { - return "tensorrt-subgraph-node-mark-debugger"; - } - - bool Finalize() override { return true; } - - protected: - std::string Draw(DataFlowGraph *graph) override { - Dot dot; - // Add nodes - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (config_.display_deleted_node || !node.deleted()) { - auto dot_attr = node.dot_attrs(); - if (node.attr(ATTR_supported_by_tensorrt).Bool()) { - dot_attr.assign( - {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}}); - } - dot.AddNode(node.repr(), dot_attr); - } - } - // Add edges - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &in : node.inlinks) { - if (!config_.display_deleted_node && in->deleted()) continue; - dot.AddEdge(in->repr(), node.repr(), {}); - } - } - return dot.Build(); - } -}; - -AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { - DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, - "tensorrt_marked_node"); - return new DfgDebuggerPass(config); -} -bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } - -} // namespace analysis -} // namespace 
inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h deleted file mode 100644 index c881a54c240538b68abdcb9060db69de3bf2b8bb..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops - * that supported by TensorRT engine. - */ - -#pragma once - -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Mark the operators that TensorRT engine supports. - */ -class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { - public: - using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; - - explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) - : teller_(teller) {} - - bool Initialize(Argument* argument) override { return true; } - - // This class get a sub-graph as input and determine whether to transform this - // sub-graph into TensorRT. 
- void Run(DataFlowGraph* graph) override; - - std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } - std::string description() const override { - return "tensorrt sub-graph mark pass"; - } - - AnalysisPass* CreateGraphvizDebugerPass() const override; - bool Finalize() override; - - private: - teller_t teller_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc deleted file mode 100644 index c1d932878e559180af987594535959afdf475587..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" - -#include -#include "paddle/fluid/inference/analysis/node_attr_flags.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(TensorRTSubgraphNodeMarkPass, test) { - // init - FluidToDataFlowGraphPass pass; - Argument argument(FLAGS_inference_model_dir); - ASSERT_TRUE(pass.Initialize(&argument)); - pass.Run(argument.main_dfg.get()); - - TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { - return node->IsFunction() && - static_cast(node)->func_type() == "mul"; - }; - TensorRTSubgraphNodeMarkPass pass1(teller); - ASSERT_TRUE(pass1.Initialize(&argument)); - pass1.Run(argument.main_dfg.get()); - - int counter{0}; - for (auto& node : argument.main_dfg->nodes.nodes()) { - counter += node->attr(ATTR_supported_by_tensorrt).Bool(); - } - ASSERT_EQ(counter, 2); - LOG(INFO) << counter << " nodes marked"; -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc deleted file mode 100644 index 3aa65f223a9e70b8ba7e387d1766ec6a97aee385..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TensorRTSubGraphPass::TensorRTSubGraphPass( - const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller) - : node_inside_subgraph_teller_(teller) {} - -void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); - VLOG(40) << "debug info " - << graph->HumanReadableInfo(false /*show_values*/, - true /*show_functions*/); -} - -} // namespace analysis -} // namespace inference - -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h deleted file mode 100644 index 3545da9109d79964f36c3d7e738620cc2e0f9a6c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/node.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Parse the graph and replace TensorRT supported nodes with SubGraphNode - */ -class TensorRTSubGraphPass : public DataFlowGraphPass { - public: - // Tell whether to transform a sub-graph into TensorRT. - using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; - - explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - - bool Initialize(Argument* argument) override { - argument_ = argument; - return true; - } - - // This class get a sub-graph as input and determine whether to transform this - // sub-graph into TensorRT. - void Run(DataFlowGraph* graph) override; - - bool Finalize() override { return true; } - - std::string repr() const override { return "tensorrt-sub-graph"; } - std::string description() const override { return "tensorrt sub graph pass"; } - - private: - NodeInsideSubgraphTeller node_inside_subgraph_teller_; - Argument* argument_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc deleted file mode 100644 index 9748e24b06295a4e7c2995429e6588cd0f225fe6..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" - -#include -#include -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -DEFINE_string(dot_dir, "./", ""); - -TEST(TensorRTSubGraphPass, main) { - std::unordered_set teller_set( - {"elementwise_add", "mul", "sigmoid"}); - SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) return true; - return false; - }; - - Argument argument(FLAGS_inference_model_dir); - argument.Set("minimum_subgraph_size", new int(0)); - argument.Set("max_batch_size", new int(3)); - argument.Set("workspace_size", new int(1 << 20)); - argument.Set("precision_mode", new std::string("FP32")); - - DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; - DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; - - DFG_GraphvizDrawPass dfg_pass(config); - DFG_GraphvizDrawPass dfg_pass1(config1); - FluidToDataFlowGraphPass pass0; - TensorRTSubGraphPass trt_pass(std::move(teller)); - - dfg_pass.Initialize(&argument); - dfg_pass1.Initialize(&argument); - pass0.Initialize(&argument); - trt_pass.Initialize(&argument); - - argument.main_dfg.reset(new DataFlowGraph); - pass0.Run(argument.main_dfg.get()); - dfg_pass.Run(argument.main_dfg.get()); - trt_pass.Run(argument.main_dfg.get()); - 
dfg_pass1.Run(argument.main_dfg.get()); - - // Check the TRT op's block desc - for (auto& node : argument.main_dfg->nodes.nodes()) { - if (node->IsFunctionBlock()) { - LOG(INFO) << "get function block"; - } - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 1073a6f686eaeeaaae2d93ab044149b7df518085..d599099a8050eaeabb8e0544b1bfe3b6b46b17ec 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -18,8 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/helper.h" namespace paddle { @@ -32,29 +30,6 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static DataFlowGraph ProgramDescToDFG( - const framework::proto::ProgramDesc& desc) { - DataFlowGraph graph; - FluidToDataFlowGraphPass pass; - Argument argument; - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); - argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); - pass.Initialize(&argument); - pass.Run(&graph); - pass.Finalize(); - return graph; -} - -class DFG_Tester : public ::testing::Test { - protected: - void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); - } - - Argument argument; -}; - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index fd05c967774b336275f4c7bd98313bc1d750502f..82f74a269a5915dfa1d97a28f5ae15a12ea0b154 100644 --- 
a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -17,17 +17,22 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) + +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) + set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) +cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) + + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -40,20 +45,10 @@ endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) -if(WITH_GPU AND TENSORRT_FOUND) 
-cc_library(paddle_inference_tensorrt_subgraph_engine - SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) - if(WITH_TESTING) - inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps} - ARGS --dirname=${WORD2VEC_MODEL_DIR}) - endif() -endif() - if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/README.md b/paddle/fluid/inference/api/README.md index 20969fac6c8f894ffb4a02b48f795e2a0dcbd096..a2d685d723bd9ab2b84969adb86e177a8754328d 100644 --- a/paddle/fluid/inference/api/README.md +++ b/paddle/fluid/inference/api/README.md @@ -2,25 +2,15 @@ Paddle inference offers the APIs in `C` and `C++` languages. -One can easily deploy a model trained by Paddle following the steps as below: +You can easily deploy a model trained by Paddle following the steps as below: 1. Optimize the native model; 2. Write some codes for deployment. +## The APIs -Let's explain the steps in detail. - -## Optimize the native Fluid Model - -The native model that get from the training phase needs to be optimized for that. 
- -- Clean the noise such as the cost operators that do not need inference; -- Prune unnecessary computation fork that has nothing to do with the output; -- Remove extraneous variables; -- Memory reuse for native Fluid executor; -- Translate the model storage format to some third-party engine's, so that the inference API can utilize the engine for acceleration; - -We have an official tool to do the optimization, call `paddle_inference_optimize --help` for more information. +All the released APIs are located in the `paddle_inference_api.h` header file. +The stable APIs are wrapped by `namespace paddle`, the unstable APIs are protected by `namespace paddle::contrib`. ## Write some codes diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ccd2dc5ab353b1634b651a4b7caa2af0da75ce4 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle_pass_builder.h" // NOLINT + +namespace paddle { + +PassStrategy *contrib::AnalysisConfig::pass_builder() const { + PADDLE_ENFORCE( + pass_builder_.get(), + "Should call constructor first, that will init the pass_builder_."); + return pass_builder_.get(); +} + +contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { + this->use_gpu = use_gpu; + if (use_gpu) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } +} + +contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { + // fields from Config + model_dir = other.model_dir; + // fields from NativeConfig + use_gpu = other.use_gpu; + device = other.device; + fraction_of_gpu_memory = other.fraction_of_gpu_memory; + prog_file = other.prog_file; + param_file = other.param_file; + specify_input_name = other.specify_input_name; + // fields from this. + enable_ir_optim = other.enable_ir_optim; + use_feed_fetch_ops = other.use_feed_fetch_ops; + use_tensorrt_ = other.use_tensorrt_; + tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; + tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + + if (use_gpu) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(other.pass_builder()))); + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(other.pass_builder()))); + } +} + +contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { + // fields from Config + model_dir = other.model_dir; + // fields from NativeConfig + use_gpu = other.use_gpu; + device = other.device; + fraction_of_gpu_memory = other.fraction_of_gpu_memory; + prog_file = other.prog_file; + param_file = other.param_file; + specify_input_name = other.specify_input_name; + // fields from this. 
+ enable_ir_optim = other.enable_ir_optim; + use_feed_fetch_ops = other.use_feed_fetch_ops; + use_tensorrt_ = other.use_tensorrt_; + tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; + tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + pass_builder_ = std::move(other.pass_builder_); +} + +void contrib::AnalysisConfig::EnableMKLDNN() { +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif +} + +void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, + int max_batch_size) { + use_tensorrt_ = true; + tensorrt_workspace_size_ = workspace_size; + tensorrt_max_batchsize_ = max_batch_size; + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dd295854a87c9707aaa85e1e2c6111089e3fe885..76d205b737aeb456f242037f2b375d9c537b39f3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -13,10 +13,13 @@ // limitations under the License. 
#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -24,6 +27,9 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#endif #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -35,6 +41,17 @@ namespace paddle { using contrib::AnalysisConfig; +namespace { +bool IsPersistable(const framework::VarDesc *var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST) { + return true; + } + return false; +} +} // namespace + bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { @@ -52,36 +69,93 @@ bool AnalysisPredictor::Init( // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); - if (config_.use_gpu) { - place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " - "is turned false."; - config_.enable_ir_optim = false; - } else { - place_ = paddle::platform::CPUPlace(); + if (!PrepareScope(parent_scope)) { + return false; + } + if (!CreateExecutor()) { + return false; + } + if (!PrepareProgram(program)) { + return false; + } + + // Prepare executor, create local variables. 
+ if (!PrepareExecutor()) { + return true; } + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + + return true; +} + +bool AnalysisPredictor::PrepareScope( + const std::shared_ptr &parent_scope) { if (parent_scope) { + PADDLE_ENFORCE_NOT_NULL( + parent_scope, + "Both program and parent_scope should be set in Clone mode."); scope_ = parent_scope; - sub_scope_ = &(parent_scope->NewScope()); + status_is_cloned_ = true; } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); + status_is_cloned_ = false; } - - executor_.reset(new paddle::framework::NaiveExecutor(place_)); - + sub_scope_ = &scope_->NewScope(); + return true; +} +bool AnalysisPredictor::PrepareProgram( + const std::shared_ptr &program) { if (!program) { if (!LoadProgramDesc()) return false; - OptimizeInferenceProgram(); + + // Optimize the program, and load parameters and modify them in the + // scope_. + // This will change the scope_ address. + if (config_.enable_ir_optim) { + status_ir_optim_enabled_ = true; + OptimizeInferenceProgram(); + } else { + // If the parent_scope is passed, we assert that the persistable variables + // are already created, so just create the no persistable variables. + + // If not cloned, the parameters should be loaded + // OptimizeInferenceProgram. + // So in both cases, just the local variables are needed to load, not the + // parematers. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + + // Load parameters + LOG(INFO) << "load parameters "; + LoadParameters(); + } } else { + // If the program is passed from external, no need to optimize it, this + // logic is used in the clone scenario. 
inference_program_ = program; } - executor_->Prepare(scope_.get(), *inference_program_, 0, + executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); + + return true; +} +bool AnalysisPredictor::CreateExecutor() { + if (config_.use_gpu) { + status_use_gpu_ = true; + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + executor_.reset(new paddle::framework::NaiveExecutor(place_)); + return true; +} +bool AnalysisPredictor::PrepareExecutor() { + executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); + PADDLE_ENFORCE_NOT_NULL(sub_scope_); return true; } @@ -206,54 +280,40 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, return true; } +// NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { - LOG(INFO) << "optimize begin"; - FLAGS_IA_enable_ir = config_.enable_ir_optim; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = ""; // Don't output the model. 
+ status_program_optimized_ = true; + + argument_.SetUseGPU(config_.use_gpu); // Analyze inference_program if (!config_.model_dir.empty()) { - argument_.fluid_model_dir.reset(new std::string(config_.model_dir)); + argument_.SetModelDir(config_.model_dir); } else { PADDLE_ENFORCE( !config_.param_file.empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.fluid_model_program_path.reset( - new std::string(config_.prog_file)); - argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); + argument_.SetModelProgramPath(config_.prog_file); + argument_.SetModelParamsPath(config_.param_file); } - argument_.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - - switch (config_.ir_mode) { - case contrib::AnalysisConfig::IrPassMode::kExclude: - Analyzer() - .IncludeAllIrPasses() - .SetUseMkldnn(config_._use_mkldnn) - .DisableIrPasses(config_.ir_passes) - .Run(&argument_); - break; - case contrib::AnalysisConfig::IrPassMode::kInclude: - Analyzer() - .SetUseMkldnn(config_._use_mkldnn) - .IncludeIrPasses(config_.ir_passes) - .Run(&argument_); - break; - default: - LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet."; + if (config_.use_gpu && config_.use_tensorrt_) { + argument_.SetUseTensorRT(true); + argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); + argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); } - CHECK(argument_.transformed_program_desc); - VLOG(50) << "to prepare executor"; + auto passes = config_.pass_builder()->AllPasses(); + if (!config_.enable_ir_optim) passes.clear(); + argument_.SetIrAnalysisPasses(passes); + argument_.SetScopeNotOwned(const_cast(scope_.get())); + Analyzer().Run(&argument_); + + PADDLE_ENFORCE(argument_.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( - new 
framework::ProgramDesc(*argument_.transformed_program_desc)); - if (argument_.Has(framework::ir::kParamScopeAttr)) { - // Update scope. - scope_.reset( - argument_.Release(framework::ir::kParamScopeAttr)); - } + new framework::ProgramDesc(argument_.ir_analyzed_program())); LOG(INFO) << "== optimize end =="; } @@ -283,10 +343,12 @@ std::unique_ptr CreatePaddlePredictor< if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } - return predictor; + return std::move(predictor); } void AnalysisPredictor::PrepareFeedFetch() { + PADDLE_ENFORCE_NOT_NULL(sub_scope_); + CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { int idx = boost::get(op->GetAttr("col")); @@ -305,6 +367,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } } +void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + auto *var = scope->Var("feed"); + var->GetMutable(); + var = scope->Var("fetch"); + var->GetMutable(); +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -335,27 +405,98 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program - std::unique_ptr tmp_exe( - new framework::Executor(platform::CPUPlace())); + std::string filename; if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load( - static_cast(tmp_exe.get()), scope_.get(), - config_.model_dir); + filename = config_.model_dir + "/__model__"; } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. 
- inference_program_ = paddle::inference::Load( - static_cast(tmp_exe.get()), scope_.get(), - config_.prog_file, config_.param_file); + filename = config_.prog_file; } else { + if (config_.model_dir.empty() && config_.prog_file.empty()) { + LOG(ERROR) + << "Either model_dir or (prog_file, param_file) should be set."; + return false; + } LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s'.", config_.model_dir, config_.param_file); return false; } + + std::string pb_content; + // Read binary + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); + + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + + // Create ProgramDesc + framework::proto::ProgramDesc proto; + proto.ParseFromString(pb_content); + inference_program_.reset(new framework::ProgramDesc(proto)); + return true; +} + +bool AnalysisPredictor::LoadParameters() { + PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), + "The inference program should be loaded first."); + const auto &global_block = inference_program_->MutableBlock(0); + + // create a temporary program to load parameters. 
+ + std::unique_ptr load_program( + new framework::ProgramDesc()); + framework::BlockDesc *load_block = load_program->MutableBlock(0); + std::vector params; + + for (auto *var : global_block->AllVars()) { + if (IsPersistable(var)) { + VLOG(3) << "persistable variable's name: " << var->Name(); + + framework::VarDesc *new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!config_.param_file.empty()) { + params.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!config_.param_file.empty()) { + // sort paramlist to have consistent ordering + std::sort(params.begin(), params.end()); + // append just the load_combine op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", params); + op->SetAttr("file_path", {config_.param_file}); + op->CheckAttrs(); + } + + // Use NaiveExecutor to Load parameters. 
+ platform::CPUPlace place; + framework::NaiveExecutor e(place); + e.Prepare(scope_.get(), *load_program, 0, false); + e.Run(); + VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; + return true; } @@ -385,3 +526,27 @@ std::unique_ptr CreatePaddlePredictor( } } // namespace paddle + +#if PADDLE_WITH_TENSORRT +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(elementwise_add_tensor); +USE_TRT_CONVERTER(elementwise_sub_tensor); +USE_TRT_CONVERTER(elementwise_div_tensor); +USE_TRT_CONVERTER(elementwise_mul_tensor); +USE_TRT_CONVERTER(elementwise_max_tensor); +USE_TRT_CONVERTER(elementwise_min_tensor); +USE_TRT_CONVERTER(elementwise_pow_tensor); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(sigmoid); +USE_TRT_CONVERTER(tanh); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); +USE_TRT_CONVERTER(dropout); +USE_TRT_CONVERTER(pad); +USE_TRT_CONVERTER(split); +#endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a9f4cce6dfa1c92301f57a7b1dd024a61f99d5ab..cf81b7db738d899566ddf32c5e5a40475c8e7bc7 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -23,7 +23,10 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" - +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif namespace paddle { using inference::analysis::Argument; @@ -54,6 +57,7 @@ class AnalysisPredictor : public PaddlePredictor { bool ZeroCopyRun() override; + void CreateFeedFetchVar(framework::Scope *scope); void PrepareFeedFetch(); void OptimizeInferenceProgram(); @@ -62,11 +66,17 @@ class AnalysisPredictor : public PaddlePredictor { std::unique_ptr Clone() override; - framework::Scope 
*scope() { return executor_->scope(); } + framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } protected: + bool PrepareProgram(const std::shared_ptr &program); + bool PrepareScope(const std::shared_ptr &parent_scope); + bool CreateExecutor(); + bool PrepareExecutor(); + bool LoadProgramDesc(); + bool LoadParameters(); bool SetFeed(const std::vector &input_datas, framework::Scope *scope); @@ -77,6 +87,14 @@ class AnalysisPredictor : public PaddlePredictor { PaddleTensor *output_data); ~AnalysisPredictor(); +// Some more detailed tests, they are made the friends of the predictor, so that +// the all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(AnalysisPredictor, analysis_off); + FRIEND_TEST(AnalysisPredictor, analysis_on); + FRIEND_TEST(AnalysisPredictor, with_gpu); +#endif + private: contrib::AnalysisConfig config_; Argument argument_; @@ -92,6 +110,13 @@ class AnalysisPredictor : public PaddlePredictor { // concurrency problems, so cache them. std::vector feed_tensors_; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; + + private: + // Some status here that help to determine the status inside the predictor. + bool status_program_optimized_{false}; + bool status_is_cloned_{false}; + bool status_use_gpu_{false}; + bool status_ir_optim_enabled_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index f75c45f3a0438bc437e716160af8c5eab5b10fce..d67305670c91bb0814b8771332641e96974ade9d 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -12,16 +12,85 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include // NOLINT +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { -namespace inference { using contrib::AnalysisConfig; +TEST(AnalysisPredictor, analysis_off) { + AnalysisConfig config(false); + config.model_dir = FLAGS_dirname; + config.enable_ir_optim = false; + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + // Without analysis, the scope_ and sub_scope_ are created by predictor + // itself. + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned off, so program shouldn't be optimized. + ASSERT_FALSE(predictor->status_program_optimized_); + LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); +} + +TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config(false); + config.model_dir = FLAGS_dirname; + config.enable_ir_optim = true; + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned on, so program should be optimized. + ASSERT_TRUE(predictor->status_program_optimized_); + // 2. 
Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + for (auto& output : outputs) { + LOG(INFO) << inference::DescribeTensor(output); + } + + // compare with NativePredictor + auto naive_predictor = CreatePaddlePredictor(config); + std::vector naive_outputs; + ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); + ASSERT_EQ(naive_outputs.size(), 1UL); + inference::CompareTensor(outputs.front(), naive_outputs.front()); +} + TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; config.model_dir = FLAGS_dirname; @@ -61,5 +130,59 @@ TEST(AnalysisPredictor, ZeroCopy) { LOG(INFO) << "output_data: " << out_data; } -} // namespace inference +TEST(AnalysisPredictor, Clone) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname; + config.use_feed_fetch_ops = true; + config.enable_ir_optim = true; + + std::vector> predictors; + predictors.emplace_back(CreatePaddlePredictor(config)); + + LOG(INFO) << "************** to clone ************************"; + const int num_threads = 3; + for (int i = 1; i < num_threads; i++) { + predictors.emplace_back(predictors.front()->Clone()); + } + + auto* root_scope = + static_cast(predictors[0].get())->scope(); + ASSERT_FALSE(root_scope->kids().empty()); + LOG(INFO) << "***** scope ******\n" + << framework::GenScopeTreeDebugInfo(root_scope); + + // 2. 
Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + predictors[0]->Run(inputs, &outputs); + + LOG(INFO) << "Run with single thread"; + for (int i = 0; i < num_threads; i++) { + LOG(INFO) << "run predictor " << i; + ASSERT_TRUE(predictors[i]->Run(inputs, &outputs)); + } + + LOG(INFO) << "Run with multiple threads"; + std::vector threads; + for (int i = 0; i < num_threads; i++) { + threads.emplace_back([&predictors, &inputs, i] { + LOG(INFO) << "thread #" << i << " running"; + std::vector outputs; + for (int j = 0; j < 10; j++) { + ASSERT_TRUE(predictors[i]->Run(inputs, &outputs)); + } + }); + } + + for (auto& t : threads) { + t.join(); + } +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 20fab8078fedf837564496aa296648bf5970a348..9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 04536ea3a53bbbc9293d92e69a23567e4bfd84c0..6a8b81cc57281b12cd3a4c89c863b20a824ce34a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,11 +19,13 @@ limitations under the License. 
*/ #pragma once +#define WITH_ANAKIN + #include #include "framework/core/net/net.h" #include "framework/graph/graph.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_anakin_config.h" #include "saber/core/shape.h" #include "saber/saber_types.h" diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 5152b8670ddb206f0927c03149684af4a096df42..014bdc6a379744463e535df97af4c9c2e1651656 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -292,7 +292,14 @@ TEST(inference_api_native, image_classification_gpu) { // TEST(inference_api_native, image_classification_gpu_threads) { // MainThreadsImageClassification(true /*use_gpu*/); // } - #endif +TEST(PassBuilder, Delete) { + contrib::AnalysisConfig config(false); + config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); + const auto& passes = config.pass_builder()->AllPasses(); + auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); + ASSERT_EQ(it, passes.end()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc deleted file mode 100644 index 94b3933497daac1a4db1787994ea1bc33ec4e74f..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/api_impl.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/tensorrt_engine_op.h" - -namespace paddle { - -using inference::analysis::Argument; -using inference::Singleton; -using inference::analysis::Analyzer; -using framework::proto::ProgramDesc; -using paddle::contrib::MixedRTConfig; - -class TensorRTSubgraphPredictor : public NativePaddlePredictor { - public: - explicit TensorRTSubgraphPredictor(const MixedRTConfig& config) - : NativePaddlePredictor(config), config_(config) {} - - bool Init(const std::shared_ptr& parent_scope) { - FLAGS_IA_enable_tensorrt_subgraph_engine = true; - VLOG(30) << "Predictor::init()"; - if (config_.use_gpu) { - place_ = paddle::platform::CUDAPlace(config_.device); - } else { - place_ = paddle::platform::CPUPlace(); - } - if (parent_scope) { - scope_ = parent_scope; - sub_scope_ = &(parent_scope->NewScope()); - } else { - paddle::framework::InitDevices(false); - scope_.reset(new paddle::framework::Scope()); - } - - executor_.reset(new paddle::framework::Executor(place_)); - - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. 
- inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); - } else { - LOG(ERROR) << "fail to load inference model."; - return false; - } - - OptimizeInferenceProgram(); - ctx_ = executor_->Prepare(*inference_program_, 0); - - VLOG(50) << "to create variables"; - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; - } - - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - PADDLE_ENFORCE_GT(batch_size, 0, - "TensorRT engine needs the argument batch_size set"); - FLAGS_tensorrt_engine_batch_size = batch_size; - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } - - void OptimizeInferenceProgram() { - // Analyze inference_program - Argument argument; - - argument.Set("minimum_subgraph_size", - new int(config_.minimum_subgraph_size)); - argument.Set("max_batch_size", new int(config_.max_batch_size)); - argument.Set("workspace_size", new int(config_.workspace_size)); - argument.Set("precision_mode", - new std::string(config_.precision_mode)); - - if (!config_.model_dir.empty()) { - argument.fluid_model_dir.reset(new std::string(config_.model_dir)); - } else { - PADDLE_ENFORCE( - !config_.param_file.empty(), - "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument.fluid_model_program_path.reset( - new std::string(config_.prog_file)); - argument.fluid_model_param_path.reset( - new std::string(config_.param_file)); - } - 
argument.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - Singleton::Global().Run(&argument); - CHECK(argument.transformed_program_desc); - VLOG(50) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(50) << "to prepare executor"; - inference_program_.reset( - new framework::ProgramDesc(*argument.transformed_program_desc)); - } - - private: - MixedRTConfig config_; -}; - -template <> -std::unique_ptr -CreatePaddlePredictor( - const MixedRTConfig& config) { - VLOG(30) << "create TensorRTSubgraphPredictor"; - if (config.use_gpu) { - // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); - std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { - flags.push_back("dummpy"); - std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); - flags.push_back(flag); - VLOG(30) << "set flag: " << flag; - framework::InitGflags(flags); - } - } - - std::unique_ptr predictor( - new TensorRTSubgraphPredictor(config)); - if (!dynamic_cast(predictor.get()) - ->Init(nullptr)) { - return nullptr; - } - return std::move(predictor); -} - -template <> -std::unique_ptr CreatePaddlePredictor( - const MixedRTConfig& config) { - return CreatePaddlePredictor(config); -} - -} // namespace paddle - -USE_TRT_CONVERTER(elementwise_add_weight); -USE_TRT_CONVERTER(elementwise_add_tensor); -USE_TRT_CONVERTER(elementwise_sub_tensor); -USE_TRT_CONVERTER(elementwise_div_tensor); -USE_TRT_CONVERTER(elementwise_mul_tensor); -USE_TRT_CONVERTER(elementwise_max_tensor); -USE_TRT_CONVERTER(elementwise_min_tensor); -USE_TRT_CONVERTER(elementwise_pow_tensor); -USE_TRT_CONVERTER(mul); -USE_TRT_CONVERTER(conv2d); -USE_TRT_CONVERTER(relu); -USE_TRT_CONVERTER(sigmoid); 
-USE_TRT_CONVERTER(tanh); -USE_TRT_CONVERTER(fc); -USE_TRT_CONVERTER(pool2d); -USE_TRT_CONVERTER(softmax); -USE_TRT_CONVERTER(batch_norm); -USE_TRT_CONVERTER(concat); -USE_TRT_CONVERTER(dropout); -USE_TRT_CONVERTER(pad); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc deleted file mode 100644 index 89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" - -namespace paddle { - -using contrib::MixedRTConfig; - -DEFINE_string(dirname, "", "Directory of the inference model."); - -void CompareTensorRTWithFluid(bool enable_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt; - - //# 1. Create PaddlePredictor with a config. 
- NativeConfig config0; - config0.model_dir = FLAGS_dirname; - config0.use_gpu = true; - config0.fraction_of_gpu_memory = 0.3; - config0.device = 0; - - MixedRTConfig config1; - config1.model_dir = FLAGS_dirname; - config1.use_gpu = true; - config1.fraction_of_gpu_memory = 0.3; - config1.device = 0; - config1.max_batch_size = 10; - - auto predictor0 = CreatePaddlePredictor(config0); - auto predictor1 = CreatePaddlePredictor(config1); - - for (int batch_id = 0; batch_id < 1; batch_id++) { - //# 2. Prepare input. - std::vector data(20); - for (int i = 0; i < 20; i++) data[i] = i; - - PaddleTensor tensor; - tensor.shape = std::vector({10, 1}); - tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); - tensor.dtype = PaddleDType::INT64; - - // For simplicity, we set all the slots with the same data. - std::vector slots(4, tensor); - - //# 3. Run - std::vector outputs0; - std::vector outputs1; - CHECK(predictor0->Run(slots, &outputs0)); - CHECK(predictor1->Run(slots, &outputs1, 10)); - - //# 4. Get output. 
- ASSERT_EQ(outputs0.size(), 1UL); - ASSERT_EQ(outputs1.size(), 1UL); - - const size_t num_elements = outputs0.front().data.length() / sizeof(float); - const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); - EXPECT_EQ(num_elements, num_elements1); - - auto *data0 = static_cast(outputs0.front().data.data()); - auto *data1 = static_cast(outputs1.front().data.data()); - - ASSERT_GT(num_elements, 0UL); - for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { - EXPECT_NEAR(data0[i], data1[i], 1e-3); - } - } -} - -TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { - CompareTensorRTWithFluid(false); -} - -TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { - CompareTensorRTWithFluid(true); -} - -} // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5446fd4d4256c10442a53ea09a447cf308cbd681..3dd1d3c838c4b1bcdefdadff16b02dbfb4a02ee9 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include //NOLINT -#include "paddle/include/paddle_inference_api.h" +#include "utils.h" // NOLINT DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_bool(use_gpu, false, "Whether use gpu."); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 6460514f3f80cac3c5e52560ab61b5cc7fd74636..0eb620ea516d28fb9598af8dbd297e84580a99f9 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -36,14 +36,13 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::MixedRTConfig config; + paddle::contrib::AnalysisConfig config(true); config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = true; config.device = 0; - config.max_batch_size = 1; + config.EnableTensorRtEngine(); config.fraction_of_gpu_memory = 0.1; // set by yourself - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config); VLOG(30) << "begin to process data"; // Just a single batch of data. diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index d747f855803a6997d08957b5d35a56a0fe4160c5..bc8891455dc8e4a30ddfcc5f89792296e59c2548 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -17,7 +17,7 @@ limitations under the License. */ */ #include -#include // use glog instead of CHECK to avoid importing other paddle header files. 
+#include #include "utils.h" // NOLINT #ifdef PADDLE_WITH_CUDA @@ -40,20 +40,17 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config; + AnalysisConfig config(use_gpu); config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = use_gpu; config.device = 0; if (FLAGS_use_gpu) { config.fraction_of_gpu_memory = 0.1; // set by yourself } - VLOG(30) << "init predictor"; predictor = CreatePaddlePredictor(config); - analysis_predictor = CreatePaddlePredictor(config); + analysis_predictor = CreatePaddlePredictor(config); - VLOG(30) << "begin to process data"; // Just a single batch of data. std::string line; std::ifstream file(FLAGS_data); @@ -68,13 +65,10 @@ void Main(bool use_gpu) { PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); input.dtype = PaddleDType::FLOAT32; - VLOG(30) << "run executor"; std::vector output, analysis_output; predictor->Run({input}, &output, 1); - VLOG(30) << "output.size " << output.size(); auto& tensor = output.front(); - VLOG(30) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 14698f6dfc8885ec1d35f1912bad10a9caa13db4..0f540699b8ffea94c3f3aaf3354a0462e0e674a9 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -51,7 +51,7 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { } template -T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { auto *tensor = static_cast(FindTensor()); auto *res = tensor->data(); @@ -67,8 +67,10 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { return res; } -template float *ZeroCopyTensor::data(PaddlePlace *place, int 
*size); -template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); @@ -84,7 +86,7 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector ZeroCopyTensor::shape() { +std::vector ZeroCopyTensor::shape() const { auto *tensor = static_cast(FindTensor()); PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); return framework::vectorize(tensor->dims()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 2d5b561d801cd9e734cab13b28e7285493e30f94..12071e09f8442f2c52a06b7c3fe4bed2c28b524a 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -24,18 +24,20 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { } template -T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return nullptr; } -template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); -template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h 
b/paddle/fluid/inference/api/helper.h index af21c0095c28b26c0ef4afc83572a9681d49d497..6f9d663121004470d57c17b8154d725fdf2b9689 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,9 +15,14 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#endif + #include #include // NOLINT +#include #include #include #include @@ -125,6 +130,51 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, return size; } +static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { + if (a.dtype != b.dtype) { + LOG(ERROR) << "dtype not match"; + return false; + } + + if (a.lod.size() != b.lod.size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t i = 0; i < a.lod.size(); i++) { + if (a.lod[i].size() != b.lod[i].size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t j = 0; j < a.lod[i].size(); j++) { + if (a.lod[i][j] != b.lod[i][j]) { + LOG(ERROR) << "lod not match"; + return false; + } + } + } + + if (a.shape.size() != b.shape.size()) { + LOG(INFO) << "shape not match"; + return false; + } + for (size_t i = 0; i < a.shape.size(); i++) { + if (a.shape[i] != b.shape[i]) { + LOG(ERROR) << "shape not match"; + return false; + } + } + + auto *adata = static_cast(a.data.data()); + auto *bdata = static_cast(b.data.data()); + for (int i = 0; i < VecReduceToInt(a.shape); i++) { + if (adata[i] != bdata[i]) { + LOG(ERROR) << "data not match"; + return false; + } + } + return true; +} + static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; @@ -157,6 +207,26 @@ static std::string DescribeTensor(const PaddleTensor &tensor) { return os.str(); } +static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name() << "]\n"; + + os << " - shape: " << to_string(tensor.shape()) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod()) { + os << 
to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + PaddlePlace place; + int size; + const auto *data = tensor.data(&place, &size); + for (int i = 0; i < size; i++) { + os << data[i] << " "; + } + return os.str(); +} + static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/api/paddle_anakin_config.h similarity index 56% rename from paddle/fluid/inference/analysis/analyzer_main.cc rename to paddle/fluid/inference/api/paddle_anakin_config.h index 5e1fe3eb797cdced56a61aa2db0c3d18601824f8..0e91c2624bed4459b936ac4477d73ae954e55bcc 100644 --- a/paddle/fluid/inference/analysis/analyzer_main.cc +++ b/paddle/fluid/inference/api/paddle_anakin_config.h @@ -11,23 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once -/* - * This file implements analysizer -- an executation help to analyze and - * optimize trained model. - */ -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include +#include +#include +#include +#include -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - using paddle::inference::analysis::Analyzer; - using paddle::inference::analysis::Argument; +#include "paddle_api.h" // NOLINT - Argument argument; - Analyzer analyzer; - analyzer.Run(&argument); +namespace paddle { +namespace contrib { +// Configurations for Anakin engine. 
+struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; - return 0; -} +} // namespace contrib +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h new file mode 100644 index 0000000000000000000000000000000000000000..2ac736df7ccd54babe582ca1383903c191069d33 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -0,0 +1,79 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +// Here we include some header files with relative paths, for that in deploy, +// the abstract path of this header file will be changed. +#include "paddle_api.h" // NOLINT +#include "paddle_pass_builder.h" // NOLINT + +namespace paddle { + +class AnalysisPredictor; +// == +// +// ----------------------------------------------------------------------------------- +// NOTE: The following APIs are not mature yet, we are still working on them. +namespace contrib { + +// NOTE WIP, not stable yet. 
+struct AnalysisConfig : public NativeConfig { + explicit AnalysisConfig(bool use_gpu = false); + explicit AnalysisConfig(const AnalysisConfig& other); + explicit AnalysisConfig(AnalysisConfig&& other); + + // Determine whether to perform graph optimization. + bool enable_ir_optim = true; + + // Get a pass builder for customize the passes in IR analysis phase. + PassStrategy* pass_builder() const; + + // NOT stable yet. + bool use_feed_fetch_ops{true}; + + void EnableTensorRtEngine(int workspace_size = 1 << 20, + int max_batch_size = 1); + bool use_tensorrt() const { return use_tensorrt_; } + + // NOTE this is just for internal development, please not use it. + // NOT stable yet. + void EnableMKLDNN(); + bool use_mkldnn() const { return use_mkldnn_; } + + friend class ::paddle::AnalysisPredictor; + + protected: + bool use_tensorrt_{false}; + bool use_mkldnn_{false}; + int tensorrt_workspace_size_; + int tensorrt_max_batchsize_; + std::unique_ptr pass_builder_; +}; + +// Configurations for Anakin engine. +struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; + +} // namespace contrib +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h new file mode 100644 index 0000000000000000000000000000000000000000..0a2a2a1a23401b5aa4d3402da6f7a3369280d8f5 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_api.h @@ -0,0 +1,220 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +namespace paddle { + +// Data type. +enum PaddleDType { + FLOAT32, + INT64, + // TODO(Superjomn) support more data types if needed. +}; + +/* + * Memory menage for PaddleTensor. + * The PaddleBuf holds a buffer for data input or output. The memory can be + * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + * should be reused for better performance. + * + * For user allocated memory, the following API can be used: + * - PaddleBuf(void* data, size_t length) to set an external memory by + * specifying + * the memory address and length. + * - Reset(void* data, size_t length) to reset the PaddleBuf with an external + * memory. + * ATTENTION, for user allocated memory, deallocation should be done by users + * externally after the program finished. The PaddleBuf won't do any allocation + * or deallocation. + * + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. + */ +class PaddleBuf { + public: + // PaddleBuf allocate memory internally, and manage it. + explicit PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Set external memory, the PaddleBuf won't manage it. 
+ PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + + // Resize the memory. + void Resize(size_t length); + // Reset to external memory, with address and length set. + void Reset(void* data, size_t length); + // Tell whether the buffer is empty. + bool empty() const { return length_ == 0; } + // Get the memory address. + void* data() const { return data_; } + // Get the memory length. + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; +}; + +// Basic input and output data structure for PaddlePredictor. +struct PaddleTensor { + PaddleTensor() = default; + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; + std::vector> lod; // Tensor+LoD equals LoDTensor +}; + +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. 
+ template + T* data(PaddlePlace* place, int* size) const; + + std::vector shape() const; + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + const std::string& name() const { return name_; } + + protected: + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + +/* + * A simple Inference API for Paddle. + */ +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + // Predict an record. + // The caller should be responsible for allocating and releasing the memory of + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + + // Clone a predictor that share the model weights, the Cloned predictor should + // be thread-safe. + virtual std::unique_ptr Clone() = 0; + + // Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + // The common configs for all the predictors. + struct Config { + std::string model_dir; // path to the model directory. + }; +}; + +struct NativeConfig : public PaddlePredictor::Config { + // GPU related fields. 
+ bool use_gpu{false}; + int device{0}; + float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + + // Specify the exact path of program and parameter files. + std::string prog_file; + std::string param_file; + + // Specify the variable's name of each input if input tensors don't follow the + // `feeds` and `fetches` of the phase `save_inference_model`. + bool specify_input_name{false}; +}; + +// A factory to help create different predictors. +// +// Usage: +// +// NativeConfig config; +// ... // change the configs. +// auto native_predictor = CreatePaddlePredictor(config); +// +// FOR EXTENSION DEVELOPER: +// Different predictors are designated by config type. Similar configs can be +// merged, but there shouldn't be a huge config containing different fields for +// more than one kind of predictors. +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +// NOTE The following APIs are too trivial, we will discard it in the following +// versions. +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + kAnalysis, // More optimization. + kAnakin // Use Anakin for inference, not mature yet. +}; + +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a755ccb93bdee018dfeaf91157e7971b4d4cd832..92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,265 +26,9 @@ limitations under the License. */ #include #include -namespace paddle { - -// Data type. -enum PaddleDType { - FLOAT32, - INT64, - // TODO(Superjomn) support more data types if needed. -}; - -/* - * Memory menage for PaddleTensor. 
- * The PaddleBuf holds a buffer for data input or output. The memory can be - * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf - * should be reused for better performance. - * - * For user allocated memory, the following API can be used: - * - PaddleBuf(void* data, size_t length) to set an external memory by - * specifying - * the memory address and length. - * - Reset(void* data, size_t length) to reset the PaddleBuf with an external - * memory. - * ATTENTION, for user allocated memory, deallocation should be done by users - * externally after the program finished. The PaddleBuf won't do any allocation - * or deallocation. - * - * To have the PaddleBuf allocate and manage the memory: - * - PaddleBuf(size_t length) will allocate a memory of size `length`. - * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION - * if the allocated memory is larger than `length`, nothing will done. - */ -class PaddleBuf { - public: - // PaddleBuf allocate memory internally, and manage it. - explicit PaddleBuf(size_t length) - : data_(new char[length]), length_(length), memory_owned_(true) {} - // Set external memory, the PaddleBuf won't manage it. - PaddleBuf(void* data, size_t length) - : data_(data), length_(length), memory_owned_{false} {} - // Copy only available when memory is managed externally. - explicit PaddleBuf(const PaddleBuf&); - - // Resize the memory. - void Resize(size_t length); - // Reset to external memory, with address and length set. - void Reset(void* data, size_t length); - // Tell whether the buffer is empty. - bool empty() const { return length_ == 0; } - // Get the memory address. - void* data() const { return data_; } - // Get the memory length. 
- size_t length() const { return length_; } - - ~PaddleBuf() { Free(); } - PaddleBuf& operator=(const PaddleBuf&); - PaddleBuf& operator=(PaddleBuf&&); - PaddleBuf() = default; - PaddleBuf(PaddleBuf&& other); - - private: - void Free(); - void* data_{nullptr}; // pointer to the data memory. - size_t length_{0}; // number of memory bytes. - bool memory_owned_{true}; -}; - -// Basic input and output data structure for PaddlePredictor. -struct PaddleTensor { - PaddleTensor() = default; - std::string name; // variable name. - std::vector shape; - PaddleBuf data; // blob of data. - PaddleDType dtype; - std::vector> lod; // Tensor+LoD equals LoDTensor -}; - -enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; -// Tensor without copy, currently only supports AnalysisPredictor. -class ZeroCopyTensor { - public: - void Reshape(const std::vector& shape); - - // Get the memory in CPU or GPU with specific data type, should Reshape first - // to tell the data size. - // Once can directly call this data to feed the data. - // This is for write the input tensor. - template - T* mutable_data(PaddlePlace place); - // Get the memory directly, will return the place and memory size by pointer. - // This is for reading the output tensor. - template - T* data(PaddlePlace* place, int* size); - - std::vector shape(); - - void SetLoD(const std::vector>& x); - std::vector> lod() const; - - protected: - explicit ZeroCopyTensor(void* scope) : scope_{scope} {} - void SetName(const std::string& name) { name_ = name; } - void* FindTensor() const; - - private: - std::string name_; - bool input_or_output_; - friend class AnalysisPredictor; - void* scope_{nullptr}; -}; - -/* - * A simple Inference API for Paddle. - */ -class PaddlePredictor { - public: - struct Config; - PaddlePredictor() = default; - PaddlePredictor(const PaddlePredictor&) = delete; - PaddlePredictor& operator=(const PaddlePredictor&) = delete; - - // Predict an record. 
- // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be available until Run returns. Caller should be - // responsible for the output tensor's buffer, either allocated or passed from - // outside. - virtual bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) = 0; - - // Zero copy input and output optimization. - // Get the input or output tensors, and operate on their memory directly, - // without copy. - virtual std::unique_ptr GetInputTensor( - const std::string& name) { - return nullptr; - } - virtual std::unique_ptr GetOutputTensor( - const std::string& name) { - return nullptr; - } - virtual bool ZeroCopyRun() { return false; } - - // Clone a predictor that share the model weights, the Cloned predictor should - // be thread-safe. - virtual std::unique_ptr Clone() = 0; - - // Destroy the Predictor. - virtual ~PaddlePredictor() = default; - - // The common configs for all the predictors. - struct Config { - std::string model_dir; // path to the model directory. - }; -}; - -struct NativeConfig : public PaddlePredictor::Config { - // GPU related fields. - bool use_gpu{false}; - int device{0}; - float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. - - // Specify the exact path of program and parameter files. - std::string prog_file; - std::string param_file; - - // Specify the variable's name of each input if input tensors don't follow the - // `feeds` and `fetches` of the phase `save_inference_model`. - bool specify_input_name{false}; -}; - -// A factory to help create different predictors. -// -// Usage: -// -// NativeConfig config; -// ... // change the configs. -// auto native_predictor = CreatePaddlePredictor(config); -// -// FOR EXTENSION DEVELOPER: -// Different predictors are designated by config type. Similar configs can be -// merged, but there shouldn't be a huge config containing different fields for -// more than one kind of predictors. 
-template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - -// NOTE The following APIs are too trivial, we will discard it in the following -// versions. -enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis, // More optimization. - kAnakin // Use Anakin for inference, not mature yet. -}; - -template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - -// == -// -// ----------------------------------------------------------------------------------- -// NOTE: The following APIs are not mature yet, we are still working on them. - -namespace contrib { - -// Accelerate GPU computation with TensorRT engine. -struct MixedRTConfig : public NativeConfig { - // Determine whether a subgraph will be executed by TRT. - int min_subgraph_size{1}; - // While TensorRT allows an engine optimized for a given max batch size - // to run at any smaller size, the performance for those smaller - // sizes may not be as well-optimized. Therefore, Max batch is best - // equivalent to the runtime batch size. - int max_batch_size{1}; - // For workspace_size, refer it from here: - // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting - int workspace_size{1 << 30}; - // We transform the Ops that can be converted into TRT layer in the model, - // and aggregate these Ops into subgraphs for TRT execution. - // We set this variable to control the minimum number of nodes in the - // subgraph, 3 as default value. - int minimum_subgraph_size = 3; - // Reserved configuration - // We just support "FP32" now, "FP16" and "INT8" will be supported. - std::string precision_mode = "FP32"; -}; - -// NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - enum class IrPassMode { - kSystem, // Use system default passes, not customize. - kInclude, // Specify the passes in `ir_passes`. 
- kExclude // Specify the disabled passes in `ir_passes`. - }; - - // Determine whether to perform graph optimization. - bool enable_ir_optim = true; - // Manually determine the IR passes to run. - IrPassMode ir_mode{IrPassMode::kExclude}; - // passes to be excluded/included - std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; - - // NOT stable yet. - bool use_feed_fetch_ops{true}; - - // NOTE this is just for internal development, please not use it. - // NOT stable yet. - bool _use_mkldnn{false}; -}; - -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; -}; - -} // namespace contrib - -int PaddleDtypeSize(PaddleDType dtype); - -} // namespace paddle +#include "paddle_api.h" // NOLINT +#ifndef WITH_ANAKIN +#include "paddle_analysis_config.h" // NOLINT +#else +#include "paddle_anakin_config.h" // NOLINT +#endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc3ce72f0832c4bf029f86e023bd9ff11f6578bd --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include + +namespace paddle { + +void PaddlePassBuilder::AppendPass(const std::string &pass_type) { + passes_.push_back(pass_type); +} + +void PaddlePassBuilder::TurnOnDebug() { + std::vector passes; + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it != "graph_viz_pass") { + it = passes_.insert(it + 1, "graph_viz_pass"); + } else { + ++it; + } + } +} + +std::string PaddlePassBuilder::DebugString() { + std::stringstream ss; + ss << "Passes to apply:\n"; + for (auto &pass : passes_) { + ss << " - " << pass << '\n'; + } + return ss.str(); +} + +void PaddlePassBuilder::DeletePass(const std::string &pass_type) { + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it == pass_type) { + it = passes_.erase(it); + } else { + ++it; + } + } +} + +void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) { + passes_.insert(std::begin(passes_) + idx, pass_type); +} + +void PaddlePassBuilder::DeletePass(size_t idx) { + passes_.erase(std::begin(passes_) + idx); +} + +void GpuPassStrategy::EnableMKLDNN() { + LOG(ERROR) << "GPU not support MKLDNN yet"; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..80658d30850aaa7212903828c5c963da5f37ca65 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace paddle { +/* + * This is a pass builder based on string. It is part of inference API. + */ +class PaddlePassBuilder { + public: + explicit PaddlePassBuilder(const std::vector &passes) + : passes_(passes) {} + + void AppendPass(const std::string &pass_type); + + void InsertPass(size_t idx, const std::string &pass_type); + + // Delete the `idx`-th pass. + void DeletePass(size_t idx); + + // Delete all the passes that have type `pass_type`. + void DeletePass(const std::string &pass_type); + + // Visualize the computation graph after each pass by generating a DOT + // language file, one can draw them with the Graphviz toolkit. + void TurnOnDebug(); + + // Human-readable information. + std::string DebugString(); + + const std::vector &AllPasses() const { return passes_; } + + protected: + std::vector passes_; +}; + +/* + * Pass strategy to help control the IR passes. + */ +class PassStrategy : public PaddlePassBuilder { + public: + explicit PassStrategy(const std::vector &passes) + : PaddlePassBuilder(passes) {} + + // The MKLDNN control exists in both CPU and GPU mode, because there can be + // still some CPU kernels running in CPU mode. + virtual void EnableMKLDNN() = 0; + + virtual ~PassStrategy() = default; +}; + +/* + * The CPU passes controller, it is used in AnalysisPredictor with CPU mode. 
+ */ +class CpuPassStrategy : public PassStrategy { + public: + CpuPassStrategy() : PassStrategy({}) { + // NOTE the large fusions should be located in the front, so that they will + // not be damaged by smaller ones. + passes_.assign({ + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "seqconv_eltadd_relu_fuse_pass", // + // "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + }); + } + + virtual ~CpuPassStrategy() = default; + + void EnableMKLDNN() override { +// TODO(Superjomn) Consider the way to mix CPU with GPU. +#ifdef PADDLE_WITH_MKLDNN + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : + std::vector({"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass"})) { + passes_.push_back(pass); + } +#endif + } + + CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} +}; + +/* + * The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. + */ +class GpuPassStrategy : public PassStrategy { + public: + GpuPassStrategy() : PassStrategy({}) { + passes_.assign({ + "infer_clean_graph_pass", "conv_bn_fuse_pass", + }); + } + + GpuPassStrategy(const GpuPassStrategy &other) + : PassStrategy(other.AllPasses()) {} + + void EnableMKLDNN() override; + + virtual ~GpuPassStrategy() = default; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index a610687a5b11999a7cb7426dbe961e5972ee1746..e09705e3c69eb2b2370bd1ad2d9cf178ef041ee6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt 
SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0a35e10f6936313928ab21a6f17c40335e8fc882..ed4c398cee518af3211cab4e982082c46ebb36c2 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,8 +1,9 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc - DEPS tensorrt_engine operator scope framework_proto op_registry) +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc +pad_op.cc split_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) @@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL) - nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL) +nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin +split_op concat_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e73c5bbf57501e4ff3c080a46d91685035652bfa..0b756534ec6fbf27a3e92bf39fb7544d9785ca48 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -27,7 +27,7 @@ class ActivationOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) + VLOG(3) << "convert a fluid Activation op to tensorrt activation layer whose " "type is " << op_type_; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 3330af2da6c97ad153dcecd86be4b441eac62b5e..d017bac66dd99a4b54c44ec786de61d1e66b8981 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -23,7 +23,7 @@ class BatchNormOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm"; + VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 60c16e35ed39a4cb14cbc16ebacaba2bb72bcc81..525ba9dc341c8c1343553ac9523611f79ac3aa2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -19,13 +19,13 @@ namespace inference { namespace tensorrt { /* - * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. 
+ * ConcatOp */ class ConcatOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7bcf2dd1eeb17e802c5647df31945284ae08fa95..43950b8c048b4e1b8974956948caa639812b2f78 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -37,8 +37,7 @@ class Conv2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) - << "convert a fluid conv2d op to tensorrt conv layer without bias"; + VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index df86a68dac521257aadac5ab1ed4020475a382f3..ddbc724e3b2a48b75df17f9bda691a1fd3883c32 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid dropout op to tensorrt dropout layer"; + VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); diff --git 
a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 0a6ce568f194f03c7259e1ebf28dd6ce4df2d594..671bcd8aa9a9fff34644a056499961cf6ab81287 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -26,7 +26,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -108,7 +108,7 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bc1d9ee2811c447cd84f7e5b415b6bd53433b986..eef4fab4e86f05fa80bc614371f1aa43e433407e 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid fc op to tensorrt fc layer without bias"; + VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index babd56d62392299a58ecc16b7441029f80022498..5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index c3699428d29e4d21f4eafbcf1fd0e255c91fabe6..4afcb0aecec9d07b52d2fd701fae8750067a6041 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid transpose op to tensorrt tranpose layer"; + VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index d943d699f2cbe6726b3e634d0bd08e7cff3057d6..48850020840a49bd309c007943f14b2f7eec5e2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) + VLOG(3) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 174cdbe53b2698d05b680d27f1f94bee38d1dfa5..80bfb2d190a5637032e7c18fbac7f22b3a9e81e1 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) + VLOG(3) << "convert a fluid softmax op to tensorrt softmax layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..12179cccc76f8b0f595f41c135290dc0f3b50ad7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SplitOp. + */ +class SplitOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid split op to tensorrt split layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto input_dims = input->getDimensions(); + int input_num = op_desc.Input("X").size(); + size_t output_num = op_desc.Output("Out").size(); + + // Get Attrs + PADDLE_ENFORCE(input_num == 1); + int axis = boost::get(op_desc.GetAttr("axis")); + std::vector output_lengths = + boost::get>(op_desc.GetAttr("sections")); + PADDLE_ENFORCE(axis != 0); + if (axis < 0) { + axis += input_dims.nbDims; + } else { + axis -= 1; + } + + PADDLE_ENFORCE(output_lengths.size() == output_num); + + // + SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); + nvinfer1::IPluginLayer* layer = + engine_->AddPlugin(&input, input_num, plugin); + + 
std::string layer_name = "split (Output: "; + for (size_t i = 0; i < output_num; i++) { + auto output_name = op_desc.Output("Out")[i]; + layer->getOutput(i)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(i)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } + layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f81d011552c152c2df79e1a272f34b954ae2a3a1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(split_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); + validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("split"); + desc.SetInput("X", {"split_input"}); + desc.SetOutput("Out", {"split_out1", "split_out2"}); + + int num = 0; + int axis = 1; + std::vector output_lengths = {2, 1}; + desc.SetAttr("axis", axis); + desc.SetAttr("num", num); + desc.SetAttr("sections", output_lengths); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(split); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 9e0f95844761db7571c5313726d34685a9aa66b2..fdd8b56b0ce5c9b5cb6395bcb437aae5ae27829b 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -61,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() { } void TensorRTEngine::FreezeNetwork() { + VLOG(3) << "TRT to freeze network"; freshDeviceId(); PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); @@ -254,6 +255,12 @@ void TensorRTEngine::freshDeviceId() { cudaSetDevice(device_); } +nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( + nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { + owned_plugin_.emplace_back(plugin); + return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); +} + } // namespace tensorrt } // namespace inference } // 
namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 828181200e300c370bbfa234c3c23ae44810878c..335acdf653e55cc7f3ceccdba88992851c8e0310 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase { void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); int GetDevice() { return device_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, + int nbInputs, PluginTensorRT*); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase { std::unordered_map buffer_sizes_; std::unordered_map itensor_map_; + // The specific GPU id that the TensorRTEngine bounded to. 
int device_; + std::vector> owned_plugin_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index b6e7968108403c9c9c192759c44eac040d1c5073..fc7ca7714e9325d2b6bce6189300aa339c81c2ba 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -52,7 +52,7 @@ class NaiveLogger : public nvinfer1::ILogger { void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { case Severity::kINFO: - LOG(INFO) << msg; + VLOG(3) << msg; break; case Severity::kWARNING: LOG(WARNING) << msg; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b7a551619a43e5300ad3205418d1174c7019ff --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -0,0 +1 @@ +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce) diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h new file mode 100644 index 0000000000000000000000000000000000000000..50c0b17d78327e22b0aa81fdac6958e80a30dfe8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +template +inline void SerializeValue(void** buffer, T const& value); + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value); + +namespace { + +template +struct Serializer {}; + +template +struct Serializer::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(T const& value) { return sizeof(T); } + static void Serialize(void** buffer, T const& value) { + std::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { + assert(*buffer_size >= sizeof(T)); + std::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } +}; + +template <> +struct Serializer { + static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + static void Serialize(void** buffer, const char* value) { + std::strcpy(static_cast(*buffer), value); + reinterpret_cast(*buffer) += strlen(value) + 1; + } + static void Deserialize(void const** buffer, size_t* buffer_size, + const char** value) { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } +}; + +template +struct Serializer, + typename std::enable_if::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(std::vector const& value) { + return sizeof(value.size()) + value.size() * sizeof(T); + } + static void Serialize(void** buffer, std::vector const& value) { + SerializeValue(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + std::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + static void Deserialize(void const** buffer, size_t* buffer_size, + std::vector* value) { + size_t size; + 
DeserializeValue(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + assert(*buffer_size >= nbyte); + std::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } +}; + +} // namespace + +template +inline size_t SerializedSize(T const& value) { + return Serializer::SerializedSize(value); +} + +template +inline void SerializeValue(void** buffer, T const& value) { + return Serializer::Serialize(buffer, value); +} + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value) { + return Serializer::Deserialize(buffer, buffer_size, value); +} diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd6a44dcc14d50cddb879763a93abf4297494ec9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -0,0 +1,81 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputDims, + int nbInputs) { + assert(nbInputs == 1); + assert(index < this->getNbOutputs()); + nvinfer1::Dims const& input_dims = inputDims[0]; + nvinfer1::Dims output_dims = input_dims; + output_dims.d[axis_] = output_length_.at(index); + return output_dims; +} + +int SplitPlugin::initialize() { + std::vector segment_offsets(1, 0); + for (int i = 0; i < this->getNbOutputs(); ++i) { + segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + } + segment_offsets_ = segment_offsets; + nvinfer1::Dims dims = this->getInputDims(0); + nx_ = 1; + for (int i = dims.nbDims - 1; i > axis_; --i) { + nx_ *= dims.d[i]; + } + ny_ = dims.d[axis_]; + nz_ = 1; + for (int i = axis_ - 1; i >= 0; --i) { + nz_ *= dims.d[i]; + } + return 0; +} + +int SplitPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, void* workspace, cudaStream_t stream) { + auto const& input_dims = this->getInputDims(0); + int input_size = 0; + float const* idata = reinterpret_cast(inputs[0]); + float** odatas = reinterpret_cast(outputs); + + // kernel impl here. 
+ int inputBatchOffset = nx_ * ny_ * nz_; + for (size_t i = 0; i < this->getNbOutputs(); i++) { + for (size_t j = 0; j < batchSize; j++) { + cudaMemcpyAsync( + odatas[i] + + j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * + sizeof(float), + inputs[0] + + (inputBatchOffset * j + segment_offsets_[i] * nx_) * + sizeof(float), + (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + } + } + + return cudaGetLastError() != cudaSuccess; +} + +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..7281e40c331550de472df49c57b1d9a5226842d5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class SplitPlugin : public PluginTensorRT { + int axis_; + std::vector output_length_; + int nx_, ny_, nz_; + std::vector segment_offsets_; + + protected: + virtual size_t getSerializationSize() override { + return SerializedSize(axis_) + SerializedSize(output_length_) + + getBaseSerializationSize(); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + // It should not be called by users. + virtual void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, output_length_); + } + + public: + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) { + assert(axis <= nvinfer1::Dims::MAX_DIMS); + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ SplitPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &axis_); + DeserializeValue(&serialData, &serialLength, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + virtual const char *getPluginType() const override { return "split"; } + virtual int getNbOutputs() const override { return output_length_.size(); } + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *inputs, + int nbInputDims) override; + virtual int initialize() override; + virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc new file mode 100644 index 0000000000000000000000000000000000000000..08016d84b15bc750738f3183d8d61a5c90862288 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void PluginTensorRT::serializeBase(void*& buffer) { + SerializeValue(&buffer, input_dims_); + SerializeValue(&buffer, max_batch_size_); + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, data_format_); +} + +void PluginTensorRT::deserializeBase(void const*& serialData, + size_t& serialLength) { + DeserializeValue(&serialData, &serialLength, &input_dims_); + DeserializeValue(&serialData, &serialLength, &max_batch_size_); + DeserializeValue(&serialData, &serialLength, &data_type_); + DeserializeValue(&serialData, &serialLength, &data_format_); +} + +size_t PluginTensorRT::getBaseSerializationSize() { + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + + SerializedSize(data_type_) + SerializedSize(data_format_)); +} + +bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); +} + +void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, + int nbInputs, + const nvinfer1::Dims* outputDims, + int nbOutputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) { + data_type_ = type; + data_format_ = format; + input_dims_.assign(inputDims, inputDims + nbInputs); + max_batch_size_ = maxBatchSize; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..4d85e955a49b7dcccae158ea06b76419419797cf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "NvInfer.h" + +#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginTensorRT : public nvinfer1::IPluginExt { + public: + PluginTensorRT() {} + PluginTensorRT(const void* serialized_data, size_t length) {} + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + virtual const char* getPluginVersion() const { return "1"; } + size_t getWorkspaceSize(int) const override { return 0; } + void terminate() override {} + virtual ~PluginTensorRT() {} + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, + const nvinfer1::Dims* outputDims, int nbOutputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) override; + + // *NOTE* The following functions need to be overrided in the subclass. 
+ virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + // Initialize the layer for execution. This is called when the engine is + // created. + int initialize() override { return 0; } + // Serialize the layer config to buffer. + virtual void serialize(void* buffer) = 0; + virtual size_t getSerializationSize() = 0; + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + protected: + // Deserialize input_dims, max_batch_size, data_type, data_format + void deserializeBase(void const*& serialData, size_t& serialLength); + size_t getBaseSerializationSize(); + // Serialize input_dims, max_batch_size, data_type, data_format + void serializeBase(void*& buffer); + + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 2ca84c80058b35840aff5d072cdc99ecf5165f8e..74569057913d1db9a7184ab61ba655b3a66e49bd 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS - --infer_model=${DAM_INSTALL_DIR}/model - --infer_data=${DAM_INSTALL_DIR}/data.txt - --use_analysis=0) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -108,7 +104,7 @@ if(WITH_GPU AND TENSORRT_FOUND) 
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") endif() - cc_test(test_trt_models SRCS trt_models_tester.cc - ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models - DEPS paddle_inference_tensorrt_subgraph_engine) + inference_analysis_test(test_trt_models SRCS trt_models_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index ceac5dc7e14365c77cf1cbbbc16e4bf3ebfced73..b369cba5c8b3f8aadd1123d6b7345fad6e47bd0f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -69,7 +69,7 @@ struct DataRecord { num_lines++; std::vector data; split(line, ',', &data); - CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3); + CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3)); // load turn data std::vector turns_tmp[MAX_TURN_NUM]; for (int i = 0; i < MAX_TURN_NUM; ++i) { @@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) { std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { PADDLE_ENFORCE_GT(outputs.size(), 0); @@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) { contrib::AnalysisConfig cfg; SetConfig(&cfg); - if (FLAGS_use_analysis) { - int num_ops; - auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis( - static_cast(predictor.get()), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); - EXPECT_EQ(num_ops, 2020); - } + int num_ops; + 
auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); + EXPECT_EQ(num_ops, 2020); } // Compare result of NativeConfig and AnalysisConfig @@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - if (FLAGS_use_analysis) { - CompareNativeAndAnalysis(cfg, input_slots_all); - } + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 5fb551810fd4d1c56547a8aa581cb6c4587df031..310852e2f7cb284bda3041911d0059b55ee3b477 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index d91f7c314d0a936da6f5b0c41920c905af5cd0ee..3a5f844de3cae7eb9b6e3555c5219c6cf8ee1919 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) { std::vector> input_slots_all; 
SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index e5c8dfd22a006d5271248c5b083aab2c22417502..2b936175ed3f8ec24826485027048c82df0461ab 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -37,12 +37,16 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); } TEST(Analyzer_resnet50, profile) { profile(); } @@ -65,11 +69,14 @@ TEST(Analyzer_resnet50, fuse_statis) { void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_resnet50, compare) { compare(); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 
e0416ff953b61f56a2ca1a45cb382d40a6cffa4a..1ae2b4b03a1b2a66b3ddc8cb66d9575751a52297 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -210,7 +210,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->specify_input_name = true; cfg->enable_ir_optim = true; - cfg->ir_passes.clear(); // Do not exclude any pass. } void SetInput(std::vector> *inputs) { @@ -226,14 +225,16 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg; + contrib::AnalysisConfig cfg(false); SetConfig(&cfg); - cfg.use_gpu = false; + cfg.fraction_of_gpu_memory = 0.1; + cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); } // Check the fuse status @@ -260,7 +261,8 @@ TEST(Analyzer_rnn1, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } // Test Multi-Thread. 
@@ -271,32 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); -} - -bool CompareTensors(const framework::Scope &a_scope, - const framework::Scope &b_scope, - const std::vector &tensors) { - for (auto &x : tensors) { - auto *a_var = a_scope.FindVar(x); - auto *b_var = b_scope.FindVar(x); - if (a_var && b_var) { - if (a_var->Type() == typeid(framework::LoDTensor) || - a_var->Type() == typeid(framework::Tensor)) { - LOG(INFO) << "comparing tensor " << x; - auto &a_t = a_var->Get(); - auto &b_t = b_var->Get(); - if (!inference::CompareTensor(a_t, b_t)) { - LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); - } - } else { - LOG(INFO) << "skip no tensor " << x; - } - } else { - LOG(INFO) << "skip tensor " << x; - } - } - return true; + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, 4 /* multi_thread */); } // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing @@ -307,7 +285,6 @@ TEST(Analyzer_rnn1, ZeroCopy) { config.use_feed_fetch_ops = false; PaddlePlace place; - int output_size{0}; auto predictor = CreatePaddlePredictor(config); @@ -353,86 +330,22 @@ TEST(Analyzer_rnn1, ZeroCopy) { Timer timer; double total_time{0}; - double native_total_time{0}; - double analysis_total_time{0.}; - for (int i = 0; i < FLAGS_repeat; i++) { timer.tic(); predictor->ZeroCopyRun(); total_time += timer.toc(); } + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - auto *output_data = output_tensor->data(&place, &output_size); - ASSERT_GT(output_size, 0); // more than one output! - - for (int i = 0; i < FLAGS_repeat; i++) { - // Run native predictor. 
- timer.tic(); - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - native_total_time += timer.toc(); - } - - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - ASSERT_TRUE( - analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); - analysis_total_time += timer.toc(); - } - - if (!FLAGS_with_precision_check) { - return; - } - int native_output_size = VecReduceToInt(native_outputs.front().shape); - - EXPECT_EQ(native_output_size, output_size); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - // Compare tensors between analysis and zerocopy - auto *p0 = static_cast(predictor.get()); - auto *p1 = static_cast(analysis_predictor.get()); - auto *p2 = static_cast(native_predictor.get()); - - std::vector tensor_names; - for (auto &var_desc : p0->program().Block(0).AllVars()) { - tensor_names.push_back(var_desc->Name()); - } - - LOG(INFO) << "Comparing tensors"; - ASSERT_TRUE( - CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); - ASSERT_TRUE( - CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); - - LOG(INFO) << "output1 " << inference::LoDTensorSummary( - p0->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - LOG(INFO) << "output2 " << inference::LoDTensorSummary( - p1->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - LOG(INFO) << "output3 " << inference::LoDTensorSummary( - p2->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - - for (int i = 0; i < output_size; i++) { - LOG(INFO) << output_data[i] << " " - << static_cast(native_outputs.front().data.data())[i] - << " " - << static_cast(analysis_outputs.front().data.data())[i]; - EXPECT_NEAR(output_data[i], - static_cast(native_outputs.front().data.data())[i], - 1e-3); + int output_size{0}; + auto *zero_copy_data = output_tensor->data(&place, &output_size); + auto *native_data = 
static_cast(native_outputs.front().data.data()); + for (size_t i = 0; i < output_size / sizeof(float); i++) { + EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); } - - LOG(INFO) << "batch_size: " << FLAGS_batch_size; - - LOG(INFO) << "zero average time: " - << total_time / (FLAGS_repeat * FLAGS_batch_size); - LOG(INFO) << "analysis average time: " - << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); - LOG(INFO) << "native average time: " - << native_total_time / (FLAGS_repeat * FLAGS_batch_size); } TEST(Analyzer_rnn1, ZeroCopyMultiThread) { diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index e0eb919bd896d73a557001982a436fc93f087a74..e2985006f0ed858e778bf4737be3aaee0e056021 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f590ef27967e47ffcb3a97e80dd147efdd1906e6..858191184a377a26042c98e17d5b8df782575efc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) { std::vector> input_slots_all; 
SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index ca19475bda372398d425b0fa6f9a732cd79a8166..34a241f070fdc62d1b1e94835fb1dad405baafa9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1) { // Get output @@ -101,20 +102,20 @@ TEST(Analyzer_Text_Classification, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { AnalysisConfig cfg; SetConfig(&cfg); // Enable embedding_fc_lstm_fuse_pass (disabled by default) - auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(), - "embedding_fc_lstm_fuse_pass"); - if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it); + cfg.pass_builder()->InsertPass(2, "embedding_fc_lstm_fuse_pass"); std::vector> input_slots_all; SetInput(&input_slots_all); - 
CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index b2cd49af9aa580482fad84b6b23cb19f954e22fc..956a235edcefb7d688983c3b63b187e284efb02a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -58,7 +58,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->enable_ir_optim = true; cfg->specify_input_name = true; // TODO(TJ): fix fusion gru - cfg->ir_passes.push_back("fc_gru_fuse_pass"); + cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } void SetInput(std::vector> *inputs) { @@ -84,12 +84,15 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { const float ocr_result_data[] = { @@ -125,11 +128,14 @@ TEST(Analyzer_vis, fuse_statis) { void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_vis, compare) { compare(); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h new file mode 100644 index 0000000000000000000000000000000000000000..aa0c4b1d049bc276cda2f58ac1edd8102fb3fd88 --- /dev/null +++ 
b/paddle/fluid/inference/tests/api/config_printer.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +namespace inference { + +thread_local int num_spaces = 0; + +static std::string GenSpaces(int num_spaces) { + std::ostringstream os; + for (int i = 0; i < num_spaces; ++i) { + os << " "; + } + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, + const PaddlePredictor::Config &config) { + os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n"; + num_spaces++; + os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { + os << GenSpaces(num_spaces) << "NativeConfig {\n"; + num_spaces++; + os << *reinterpret_cast(&config); + os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n"; + os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; + os << GenSpaces(num_spaces) + << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) + << "specify_input_name: " << config.specify_input_name << 
"\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const contrib::AnalysisConfig &config) { + os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; + num_spaces++; + os << *reinterpret_cast(&config); + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; + os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8c5888d8da7b33eeca77311c10dd818648e8e524..a4046914132cc713a707fc2a4d12087383d77fe5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -19,12 +19,16 @@ #include #include // NOLINT #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" + +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -37,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); +DECLARE_bool(profile); + namespace paddle { namespace inference { -using 
contrib::AnalysisConfig; +void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { + if (use_analysis) { + LOG(INFO) << *reinterpret_cast(config); + return; + } + LOG(INFO) << *config; +} void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { @@ -76,42 +88,58 @@ void CompareResult(const std::vector &outputs, } std::unique_ptr CreateTestPredictor( - const AnalysisConfig &config, bool use_analysis = true) { + const PaddlePredictor::Config *config, bool use_analysis = true) { if (use_analysis) { - return CreatePaddlePredictor(config); - } else { - return CreatePaddlePredictor(config); + return CreatePaddlePredictor( + *(reinterpret_cast(config))); } + return CreatePaddlePredictor( + *(reinterpret_cast(config))); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { + std::unordered_map res; auto *analysis_predictor = static_cast(predictor); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { + auto *fusion_status = + analysis_predictor->analysis_argument().fusion_statis_ptr(); + if (!fusion_status) { + return res; + } + for (auto &item : *fusion_status) { LOG(INFO) << "fused " << item.first << " " << item.second; } int num = 0; for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { + analysis_predictor->analysis_argument().main_graph().Nodes()) { + if (node->IsOp()) { ++num; } } *num_ops = num; - return fuse_statis; + return *fusion_status; } void SetFakeImageInput(std::vector> *inputs, - const std::string &dirname) { + const std::string &dirname, bool is_combined = true, + std::string model_filename = "model", + std::string params_filename = "params") { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); - std::vector> 
feed_target_shapes = - GetFeedTargetShapes(dirname, true, "model", "params"); + std::vector> feed_target_shapes = GetFeedTargetShapes( + dirname, is_combined, model_filename, params_filename); + std::ostringstream os; + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + os << "feed target " << i << ": {" << feed_target_shapes[i][0]; + for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) { + os << ", " << feed_target_shapes[i][j]; + } + os << "}\n"; + } + LOG(INFO) << os.str(); + int dim1 = feed_target_shapes[0][1]; int dim2 = feed_target_shapes[0][2]; int dim3 = feed_target_shapes[0][3]; @@ -135,25 +163,43 @@ void SetFakeImageInput(std::vector> *inputs, } void TestOneThreadPrediction( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, bool use_analysis = true) { int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; auto predictor = CreateTestPredictor(config, use_analysis); - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs); + + // warmup run + LOG(INFO) << "Warm up run..."; + { + Timer warmup_timer; + warmup_timer.tic(); + predictor->Run(inputs[0], outputs, batch_size); + PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); +#if !defined(_WIN32) + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); } +#endif + } + + LOG(INFO) << "Run " << num_times << " times..."; + { + Timer run_timer; + run_timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs, batch_size); + } + } + PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times, + inputs.size()); } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, - inputs.size()); } void TestMultiThreadPrediction( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs, 
std::vector *outputs, int num_threads, bool use_analysis = true) { @@ -161,11 +207,12 @@ void TestMultiThreadPrediction( int num_times = FLAGS_repeat; std::vector threads; std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + for (int tid = 1; tid < num_threads; ++tid) { + predictors.emplace_back(predictors.front()->Clone()); } + + size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { #ifdef PADDLE_WITH_MKLDNN @@ -173,17 +220,21 @@ void TestMultiThreadPrediction( #endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. - std::vector> inputs_tid = inputs; std::vector outputs_tid; + auto &predictor = predictors[tid]; + LOG(INFO) << "running thread " << tid; Timer timer; timer.tic(); for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs_tid.size(); j++) { - predictors[tid]->Run(inputs_tid[j], &outputs_tid); + for (const auto &input : inputs) { + ASSERT_TRUE(predictor->Run(input, &outputs_tid)); } } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times, inputs_tid.size()); + + auto time = timer.toc(); + total_time += time; + PrintTime(batch_size, num_times, num_threads, tid, time / num_times, + inputs.size()); }); } for (int i = 0; i < num_threads; ++i) { @@ -191,12 +242,11 @@ void TestMultiThreadPrediction( } } -void TestPrediction(const AnalysisConfig &config, +void TestPrediction(const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = FLAGS_use_analysis) { - LOG(INFO) << "use_analysis: " << use_analysis - << ", use_mkldnn: " << config._use_mkldnn; + PrintConfig(config, 
use_analysis); if (num_threads == 1) { TestOneThreadPrediction(config, inputs, outputs, use_analysis); } else { @@ -206,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config, } void CompareNativeAndAnalysis( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs) { - LOG(INFO) << "use_mkldnn: " << config._use_mkldnn; + PrintConfig(config, true); std::vector native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 75840a9c437d956da4f542a38b2532ea20ee96c5..922feba10fec5d1d13b47dbce064fce2e01d8998 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -1,108 +1,149 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include #include -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" + +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { -using paddle::contrib::MixedRTConfig; - -DEFINE_string(dirname, "", "Directory of the inference model."); - -NativeConfig GetConfigNative() { - NativeConfig config; - config.model_dir = FLAGS_dirname; - // LOG(INFO) << "dirname " << config.model_dir; - config.fraction_of_gpu_memory = 0.45; - config.use_gpu = true; - config.device = 0; - return config; +namespace inference { + +DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine."); +DEFINE_string(prog_filename, "", "Name of model file."); +DEFINE_string(param_filename, "", "Name of parameters file."); + +template +void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, + bool use_tensorrt = false, int batch_size = -1) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->prog_file = model_dir + "/" + FLAGS_prog_filename; + config->param_file = model_dir + "/" + FLAGS_param_filename; + } else { + config->model_dir = model_dir; + } + if (use_gpu) { + config->use_gpu = true; + config->device = 0; + config->fraction_of_gpu_memory = 0.15; + } } -MixedRTConfig GetConfigTRT() { - MixedRTConfig config; - config.model_dir = FLAGS_dirname; - config.use_gpu = true; - config.fraction_of_gpu_memory = 0.2; - config.device = 0; - config.max_batch_size = 3; - return config; +template <> +void SetConfig(contrib::AnalysisConfig* config, + 
std::string model_dir, bool use_gpu, + bool use_tensorrt, int batch_size) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->prog_file = model_dir + "/" + FLAGS_prog_filename; + config->param_file = model_dir + "/" + FLAGS_param_filename; + } else { + config->model_dir = model_dir; + } + if (use_gpu) { + config->use_gpu = true; + config->device = 0; + config->fraction_of_gpu_memory = 0.15; + if (use_tensorrt) { + config->EnableTensorRtEngine(1 << 10, batch_size); + config->pass_builder()->DeletePass("conv_bn_fuse_pass"); + config->pass_builder()->DeletePass("fc_fuse_pass"); + config->pass_builder()->TurnOnDebug(); + } else { + config->enable_ir_optim = true; + } + } } -void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { - NativeConfig config0 = GetConfigNative(); - config0.model_dir = model_dirname; - - MixedRTConfig config1 = GetConfigTRT(); - config1.model_dir = model_dirname; - config1.max_batch_size = batch_size; - - auto predictor0 = CreatePaddlePredictor(config0); - auto predictor1 = CreatePaddlePredictor(config1); - // Prepare inputs - int height = 224; - int width = 224; - float *data = new float[batch_size * 3 * height * width]; - memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); - data[0] = 1.0f; - - // Prepare inputs - PaddleTensor tensor; - tensor.name = "input_0"; - tensor.shape = std::vector({batch_size, 3, height, width}); - tensor.data = PaddleBuf(static_cast(data), - sizeof(float) * (batch_size * 3 * height * width)); - tensor.dtype = PaddleDType::FLOAT32; - std::vector paddle_tensor_feeds(1, tensor); - - // Prepare outputs - std::vector outputs0; - std::vector outputs1; - CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); - - CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); - - // Get output. 
- ASSERT_EQ(outputs0.size(), 1UL); - ASSERT_EQ(outputs1.size(), 1UL); - - const size_t num_elements = outputs0.front().data.length() / sizeof(float); - const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); - EXPECT_EQ(num_elements, num_elements1); - - auto *data0 = static_cast(outputs0.front().data.data()); - auto *data1 = static_cast(outputs1.front().data.data()); - - ASSERT_GT(num_elements, 0UL); - for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { - EXPECT_NEAR(data0[i], data1[i], 1e-3); +void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + std::vector outputs; + if (use_analysis || use_tensorrt) { + contrib::AnalysisConfig config(true); + SetConfig(&config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, true); + } else { + NativeConfig config; + SetConfig(&config, model_dir, true, false); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, false); } } -TEST(trt_models_test, mobilenet) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet"); +void compare(std::string model_dir, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + std::vector native_outputs; + NativeConfig native_config; + SetConfig(&native_config, model_dir, true, false, + FLAGS_batch_size); + TestOneThreadPrediction( + reinterpret_cast(&native_config), inputs_all, + 
&native_outputs, false); + + std::vector analysis_outputs; + contrib::AnalysisConfig analysis_config(true); + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + TestOneThreadPrediction( + reinterpret_cast(&analysis_config), inputs_all, + &analysis_outputs, true); + + CompareResult(native_outputs, analysis_outputs); } -TEST(trt_models_test, resnet50) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50"); +TEST(TensorRT_mobilenet, compare) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + compare(model_dir, /* use_tensorrt */ true); } -TEST(trt_models_test, resnext50) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50"); +TEST(TensorRT_resnet50, compare) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare(model_dir, /* use_tensorrt */ true); } +TEST(TensorRT_resnext50, compare) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, /* use_tensorrt */ true); +} + +TEST(TensorRT_resnext50, profile) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); +} + +TEST(TensorRT_mobilenet, analysis) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + compare(model_dir, /* use_tensorrt */ false); +} + +} // namespace inference } // namespace paddle + +USE_PASS(tensorrt_subgraph_pass); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index edefeed67eb048387fbee2bf067553a8023001c2..5c06cad64eca296b0a9b57bd22d2a94cd7d00838 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "glog/logging.h" @@ -21,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/string/printf.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -145,12 +147,18 @@ void* Alloc(const platform::CUDAPlace& place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " - << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail); LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize(); - LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize(); - LOG(WARNING) << "GPU memory used: " << Used(place); + LOG(WARNING) << "GpuMinChunkSize " + << string::HumanReadableSize( + buddy_allocator->GetMinChunkSize()); + LOG(WARNING) << "GpuMaxChunkSize " + << string::HumanReadableSize( + buddy_allocator->GetMaxChunkSize()); + LOG(WARNING) << "GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } if (FLAGS_init_allocated_mem) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 1db2d54218f11eea10a2702fec28665a4c446f72..857bd87bb063be887ef8da8de9b92cb811f0596f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") + +set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) # op_library is a function to create op library. 
The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library @@ -37,6 +39,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu) endif() @@ -86,7 +94,8 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -300,8 +309,10 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +if (NOT WIN32) op_library(crf_decoding_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel) +endif(NOT WIN32) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) @@ -317,8 +328,8 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) -op_library(tensor_array_to_tensor_op DEPS concat_op) op_library(concat_op DEPS 
concat_and_split) +op_library(tensor_array_to_tensor_op DEPS concat_op) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) @@ -327,6 +338,8 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + + if (NOT WIN32) add_subdirectory(reader) endif(NOT WIN32) @@ -353,3 +366,14 @@ if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) + +if(WITH_GPU) + foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES}) + file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT}) + if (MATCHED) + string(STRIP ${CMAKE_MATCH_1} MATCHED) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n") + endif() + endforeach() +endif() diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 0784920064a879963cd9725cd9acf4cec7b874ce..cb98bc514083ad113fdebfbac043a9516fd9435a 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -53,7 +53,7 @@ class AucOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Predict")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 3083e622c3066879e107f930a45bcec36d347f80..3a4086274d8a4bf6725df9f3195cec2446ceae6c 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; -static constexpr size_t kNUM_CUDNN_FWD_ALGS = - 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +#if CUDNN_VERSION_MIN(6, 0, 5) +static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; +#else +// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; +#endif template class CUDNNConvOpKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index d5eec148f9b4f76866ec9fca98a596b9bc2860ef..e5c3f0eeb385e1a15fdbb12a989603996420efe3 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,6 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..99df15c3226b4305a28a3912398d6d1c766daa73 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -0,0 +1,175 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +class DensityPriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DensityPriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of DensityPriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + auto variances = ctx->Attrs().Get>("variances"); + + auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); + auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); + auto densities = ctx->Attrs().Get>("densities"); + + PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), + "The number of fixed_sizes and densities must be equal."); + size_t num_priors = 0; + if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { + for (size_t i = 0; i < densities.size(); ++i) { + if (fixed_ratios.size() > 0) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + } + } + std::vector dim_vec(4); + dim_vec[0] = 
input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + platform::CPUPlace()); + } +}; + +class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor, default Tensor), " + "the input feature data of DensityPriorBoxOp, the layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of DensityPriorBoxOp, the layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. 
" + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddAttr>("variances", + "(vector) List of variances to be " + "encoded in density prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr( + "step_w", + "Density prior boxes step across width, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr( + "step_h", + "Density prior boxes step across height, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Density prior boxes center offset.") + .SetDefault(0.5); + AddAttr>("fixed_sizes", + "(vector) List of fixed sizes " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_sizes) { + for (size_t i = 0; i < fixed_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_sizes[i], 0.0, + "fixed_sizes[%d] should be larger than 0.", i); + } + }); + + AddAttr>("fixed_ratios", + "(vector) List of fixed ratios " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_ratios) { + for (size_t i = 0; i < fixed_ratios.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_ratios[i], 0.0, + "fixed_ratios[%d] should be larger than 0.", i); + } + }); + + AddAttr>("densities", + "(vector) List of densities " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& 
densities) { + for (size_t i = 0; i < densities.size(); ++i) { + PADDLE_ENFORCE_GT(densities[i], 0, + "densities[%d] should be larger than 0.", i); + } + }); + AddComment(R"DOC( + Density Prior box operator + Each position of the input produce N density prior boxes, N is determined by + the count of fixed_ratios, densities, the calculation of N is as follows: + for density in densities: + N += size(fixed_ratios)*density^2 + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(density_prior_box, ops::DensityPriorBoxOp, + ops::DensityPriorBoxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(density_prior_box, ops::DensityPriorBoxOpKernel, + ops::DensityPriorBoxOpKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9a52077e9cf90b278549a077af161bd4e282d972 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class DensityPriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + int num_priors = 0; + if (fixed_sizes.size() > 0 && densities.size() > 0) { + for (size_t i = 0; i < densities.size(); ++i) { + if (fixed_ratios.size() > 0) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + + int step_average = static_cast((step_width + step_height) * 0.5); + + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + int idx = 0; + // Generate density prior boxes with fixed sizes. 
+ for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + // Generate density prior boxes with fixed ratios. + if (fixed_ratios.size() > 0) { + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = + center_x - step_average / 2. + shift / 2. + dj * shift; + float center_y_temp = + center_y - step_average / 2. + shift / 2. + di * shift; + e_boxes(h, w, idx, 0) = + (center_x_temp - box_width_ratio / 2.) / img_width >= 0 + ? (center_x_temp - box_width_ratio / 2.) / img_width + : 0; + e_boxes(h, w, idx, 1) = + (center_y_temp - box_height_ratio / 2.) / img_height >= 0 + ? (center_y_temp - box_height_ratio / 2.) / img_height + : 0; + e_boxes(h, w, idx, 2) = + (center_x_temp + box_width_ratio / 2.) / img_width <= 1 + ? (center_x_temp + box_width_ratio / 2.) / img_width + : 1; + e_boxes(h, w, idx, 3) = + (center_y_temp + box_height_ratio / 2.) / img_height <= 1 + ? (center_y_temp + box_height_ratio / 2.) 
/ img_height + : 1; + idx++; + } + } + } + } + } + } + } + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + + auto var_et = framework::EigenTensor::From(var_t); + + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index c82930cc4994c3854e60f40ae9909a90d82cbff6..2d262f932aed9761143f7983c9a38f7a97c374ea 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,6 +15,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 93204216f947e5203863a3493005faa0c03ae4af..7bb6934e1496cc989eee8ba82f56959522803bfb 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -111,6 +111,17 @@ class RowwiseTransformIterator return *this; } + RowwiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + bool operator==(const RowwiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); @@ -149,6 +160,21 @@ class MidWiseTransformIterator return *this; } + MidWiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + + return *this; + } + bool operator==(const MidWiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index fa4dec9cf118cef9b836943fd4eae90d23e6218a..e80249fc87855311479b35af61f872182292795a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "Out(Output) of Fully Connected should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) of Fully Connected should not be null."); - // NCHW + auto in_dims = ctx->GetInputDim("Input"); - // IO, I=C*H*W auto w_dims = ctx->GetInputDim("W"); - std::vector output_shape({in_dims[0], w_dims[1]}); if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); @@ -44,14 +42,32 @@ void 
FCOp::InferShape(framework::InferShapeContext* ctx) const { "The shape of Bias must be [1, dim]."); } } - PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, - "Fully Connected input should be 2-D or 4-D tensor."); + + if (ctx->Attrs().Get("use_mkldnn")) { + PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, + "Fully Connected input should be 2-D or 4-D tensor."); + } PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Fully Connected input should be 2-D tensor."); - PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0], - "Fully Connected input and weigth size do not match."); + int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); + PADDLE_ENFORCE_GT( + in_dims.size(), in_num_col_dims, + "The input tensor Input's rank of FCOp should be larger than " + "in_num_col_dims."); + + auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims[0], + "Fully Connected input and weigth size do not match. %s, %s"); + + std::vector output_dims; + output_dims.reserve(static_cast(in_num_col_dims + 1)); + for (int i = 0; i < in_num_col_dims; ++i) { + output_dims.push_back(in_dims[i]); + } + output_dims.push_back(w_dims[1]); - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); } @@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( } void FCOpMaker::Make() { - AddInput("Input", - "(Tensor), The input tensor of fully connected operator with format " - "(NCHW). 
"); + AddInput("Input", "(Tensor), The input tensor of fully connected operator."); AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") .AsDispensable(); + AddAttr("in_num_col_dims", + "(int, default 1), The fc op can take tensors with more than " + "two dimensions as its inputs.") + .SetDefault(1) + .EqualGreaterThan(1); AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") @@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel { auto output = ctx.Output("Out"); auto in_dims = input->dims(); auto w_dims = w->dims(); + auto out_dims = output->dims(); + int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; const T* input_data = input->data(); const T* w_data = w->data(); T* output_data = output->mutable_data(ctx.GetPlace()); auto blas = math::GetBlas(ctx); math::FCCompute( - blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, + blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data, bias ? 
bias->data() : NULL); // TODO(TJ): fuse act diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index d74d4db92528d69492ab7b90a98d3775dadc35d1..e4df59c5d51c390cf593add0c5562665c91f33f6 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); + int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index d72e07d76c97e9e455e54980207d7c02842cc04b..dc08ee5efacde5e232d751b13aaf11f51237634a 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index f84ff206fffddef1030b7ed439e887bdfef342a6..95aa9b573c795159079bdb5401b34d7a61252115 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null."); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1); + PADDLE_ENFORCE(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); int batch_size = 
ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; @@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); } protected: @@ -75,7 +77,7 @@ Gather Operator. $Out = X[Index]$ -Out is obtained by gathering entries of the outer-most dimension +Out is obtained by gathering entries of the outer-most dimension of X indexed by Index and concatenate them together. Example: diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 4e91a3dcd272c8d368cb8c43e7e1fb4c98265db4..08a6043eb07a6e44d46428ee195f6cb28c2ee77c 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, Tensor ones; ones.mutable_data({n, h, w}, ctx.GetPlace()); auto ones_t = EigenTensor::From(ones).setConstant(1.0); - Tensor half_xmax, half_ymax; + Tensor half_xmax; + Tensor half_ymax; half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_xmax_t = EigenTensor::From(half_xmax).setConstant(0.5 * x_max); diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index b9ebe71a3d7ae270a10a45f4805652415078b363..b2c2c7954b79658e66f1524a81bcad0b7bf22c35 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel { std::vector out_dims; out_dims.reserve(dims.size() + 1); // copy all dims except the last one - for (size_t i = 0u; i != dims.size() - 1; ++i) { + for (int i = 0u; i != dims.size() - 1; ++i) { out_dims.emplace_back(dims[i]); } int num_hash = ctx->Attrs().Get("num_hash"); diff --git a/paddle/fluid/operators/load_op.cc 
b/paddle/fluid/operators/load_op.cc index 51219504ffa2a778b56351f759e8a8dfb951ad91..df1edc5c2e994b3093d6f6e7e4f6e0e5b2abb469 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -40,8 +40,9 @@ class LoadOp : public framework::OperatorBase { auto out_var_name = Output("Out"); auto *out_var = scope.FindVar(out_var_name); - PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", - out_var_name); + PADDLE_ENFORCE(out_var != nullptr, + "Output variable %s cannot be found in scope %p", + out_var_name, &scope); if (out_var->IsType()) { LoadLodTensor(fin, place, out_var); diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index de3f0990e109cacd49c4d888bbc1f797fb196e01..a6843f20a59a23bd4e875b0f96524cc8d7aa46d6 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -45,6 +45,7 @@ class LookupSparseTableOp : public framework::OperatorBase { auto out_var = scope.FindVar(Output("Out")); auto w_var = scope.FindVar(Input("W")); auto ids_var = scope.FindVar(Input("Ids")); + auto is_test = Attr("is_test"); PADDLE_ENFORCE(out_var->IsType(), "The type of Out var should be LodTensor."); @@ -65,7 +66,7 @@ class LookupSparseTableOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), framework::proto::VarType::FP32, "The sparse table only support FP32"); - w_t->Get(ids_t, out_t, true); + w_t->Get(ids_t, out_t, true, is_test); } }; @@ -91,6 +92,10 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { "(bool default false)" "Whether create new value if for nonexistent key.") .SetDefault(true); + AddAttr("is_test", + "In test mode, lookup_sparse_table will " + "return a 0 for unknown id") + .SetDefault(false); AddComment(R"DOC( Lookup Sprase Tablel Operator. 
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b459a6a2e56b7c256efdb535b4652c64bae23c..61c3cb34a2472c0ba7d2a7ea5abf8e826a793951 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" #include +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,34 +30,43 @@ struct LRNFunctor { const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, T k, T alpha, T beta) { - auto x_v = framework::EigenVector::Flatten(input); - - const int start = -(n - 1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(input); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c < end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s += alpha * r.square(); - } - } + const T* idata = input.data(); + auto place = ctx.GetPlace(); + auto blas = math::GetBlas(ctx); + T* odata = out->mutable_data(place); + T* mdata = mid->mutable_data(place); + Tensor squared; + T* sdata = squared.mutable_data({1, C + n - 1, H, W}, place); + std::memset(sdata, 0, sizeof(T) * squared.numel()); + for (int i = 0; i < mid->numel(); ++i) { + mdata[i] = k; + } + int img_size = H * W; + int fea_size = C * img_size; + int pre_pad = (n - 1) / 2; + // compute batches one by one + for (int i = 0; i < N; ++i) { + blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); + // init the first channel of mid + for (int c = 0; c < n; ++c) { + blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i 
* fea_size); + } + for (int c = 1; c < C; ++c) { + // copy previous scale + int mid_offset = i * fea_size + c * img_size; + std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size, + img_size * sizeof(T)); + // add last + blas.AXPY(img_size, alpha, sdata + (c + n - 1) * img_size, + mdata + mid_offset); + // sub rest + blas.AXPY(img_size, -alpha, sdata + (c - 1) * img_size, + mdata + mid_offset); } } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + // compute the final output + blas.VPOW(mid->numel(), mdata, -beta, odata); + blas.VMUL(mid->numel(), odata, idata, odata); } }; template struct LRNFunctor; @@ -156,6 +166,9 @@ class LRNOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + int n = ctx->Attrs().Get("n"); + PADDLE_ENFORCE(n > 0 && n % 2 == 1, "n should be positive odd value"); + ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); ctx->SetOutputDim("MidOut", x_dim); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 0fd3175e8579df9e61368cc151a94fa45e433884..12d39c3815395896343238b536110aecac66a376 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -60,7 +60,6 @@ class LRNKernel : public framework::OpKernel { T beta = ctx.Attr("beta"); T k = ctx.Attr("k"); - PADDLE_ENFORCE(n > 0, "n should >= 0"); PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d9bf2a8aafb9264aa62f3f7404746a53bbb4155f..a4100585592496a03c2e1b5f5c132103a0aa9a8d 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -75,12 +75,13 @@ if(WITH_GPU) endif() 
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) - -set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) -endif() -cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +if (NOT WIN32) + set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) + set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) + if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) + endif() + cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) + cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +endif (NOT WIN32) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index da185d93c09f9b06bd5968b9c8e93176f9ef014b..5d0d562030d2a20e4a1cefd3c36c6533fd35dc96 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -152,6 +152,12 @@ class Blas { template void VEXP(int n, const T* x, T* y) const; + template + void VSQR(int n, const T* x, T* y) const; + + template + void VPOW(int n, const T* x, T alpha, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; @@ -238,6 +244,16 @@ class BlasT : private Blas { Base()->template VEXP(args...); } + template + void VSQR(ARGS... args) const { + Base()->template VSQR(args...); + } + + template + void VPOW(ARGS... args) const { + Base()->template VPOW(args...); + } + template void GEMV(ARGS... 
args) const { Base()->template GEMV(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index e1df78d11e41c5f74e244643f40c6d0581fa6a4a..59454669be9e0f92a6fc0db52445307d88e1c7d8 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include #include "paddle/fluid/operators/math/math_function.h" @@ -102,6 +103,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vsExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vsSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vsPowx(args...); + } }; template <> @@ -182,6 +193,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vdExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vdSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vdPowx(args...); + } }; #else @@ -241,6 +262,8 @@ struct CBlas { } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } + static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) 
{ PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML @@ -398,6 +421,31 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } +template <> +template +void Blas::VSQR(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VSQR(n, x, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::sqrt(x[i]); + } +#endif +} + +template <> +template +void Blas::VPOW(int n, const T *x, T a, + T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VPOW(n, x, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::pow(x[i], a); + } +#endif +} + template <> template T Blas::DOT(int n, const T *x, const T *y) const { diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 6b3eecfbd11471b5d95dcb10c91acc536f78cb85..e46f60f764ab9f1c292db339a5b38b976de5a11a 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,6 +118,39 @@ void VXXJitCode::generate() { ret(); } +bool ReluJitCode::init(int d) { return MayIUse(avx); } + +void ReluJitCode::generate() { + int offset = 0; + vxorps(ymm_zero, ymm_zero, ymm_zero); + for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + vmovups(ymm_src, ptr[param1 + offset]); + vmaxps(ymm_dst, ymm_zero, ymm_src); + vmovups(ptr[param2 + offset], ymm_dst); + offset += sizeof(float) * AVX_FLOAT_BLOCK; + } + int rest = num_ % AVX_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovups(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * 4; + rest -= 4; + } + if (rest >= 2) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovq(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * 2; + rest -= 2; + } + if (rest > 0) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovss(ptr[param2 + offset], xmm_dst); + } + ret(); +} } // namespace gen } // namespace jitkernel } // namespace 
math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index aaedb0ae10323eeddfba9512d9e47c7a22320610..3c242870a24c5bb29d34d4b99406c5df8cec6763 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -85,6 +85,29 @@ class VXXJitCode : public JitCode { ymm_t ymm_zero = ymm_t(3); }; +class ReluJitCode : public JitCode { + public: + DECLARE_JIT_CODE(ReluJitCode); + explicit ReluJitCode(int d, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d) {} + static bool init(int d); + void generate() override; + + private: + int num_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + + xmm_t xmm_zero = xmm_t(0); + xmm_t xmm_src = xmm_t(1); + xmm_t xmm_dst = xmm_t(1); + + ymm_t ymm_zero = ymm_t(0); + ymm_t ymm_src = ymm_t(1); + ymm_t ymm_dst = ymm_t(1); +}; + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index e9b259282cd00cc2afc46634423ec09590bf5dd3..cd3a45e66773c89e45e80ab77ebd925abd6cbe53 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel { template class VActKernel : public Kernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VReluKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; + void (*Compute)(const T *, T *, int); }; template class VIdentityKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VExpKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void 
ComputeDeprecated(const T *x, T *y) const = 0; }; template class VSigmoidKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VTanhKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index c4bfbcf925a2bbdc39f8468049c58e126d3eba1b..cf46a210afbd4903dc3841f27765c390f721c763 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) { } } +template +void VReluRefer(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -344,124 +351,60 @@ bool VAddBiasKernelImpl::useJIT(int d) { } #endif -#undef DECLARE_STATIC_FUNC - -REGISTER_JITKERNEL(vmul, VMulKernel); -REGISTER_JITKERNEL(vadd, VAddKernel); -REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); -REGISTER_JITKERNEL(vscal, VScalKernel); -REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); - /* VRelu JitKernel */ -template +template class VReluKernelImpl : public VReluKernel { public: - explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } - void Compute(const T* x, T* y) const override { - for (int i = 0; i < this->num_; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; + DECLARE_STATIC_FUNC; + explicit VReluKernelImpl(int d) : VReluKernel() { + this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 /*init*/ + + d / AVX_FLOAT_BLOCK * 4 /* instructions*/ * + 8 /*everage byte for each instruction*/; + jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? 
sz : 4096)); + this->Compute = jitcode_->getCode(); + return; } - } -}; - -#define INTRI8_FLOAT(isa) \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ - _mm256_storeu_ps(y, tmp); \ - } - -#define INTRI16_FLOAT(isa) \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 zeros = _mm256_setzero_ps(); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = _mm256_max_ps(tmp0, zeros); \ - tmp1 = _mm256_max_ps(tmp1, zeros); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ - } +#endif -#define INTRI_GT8LT16_FLOAT(isa) \ - template <> \ - VReluKernelImpl::VReluKernelImpl(int d) \ - : VReluKernel() { \ - this->num_ = d; \ - this->end_ = AVX_FLOAT_BLOCK; \ - this->rest_ = d - AVX_FLOAT_BLOCK; \ - } \ - template <> \ - void VReluKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 zeros = _mm256_setzero_ps(); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ - tmp0 = _mm256_max_ps(tmp0, zeros); \ - tmp1 = _mm256_max_ps(tmp1, zeros); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + this->rest_, tmp1); \ + this->Compute = VReluRefer; } - -#define INTRI_GT16_FLOAT(isa) \ - template <> \ - VReluKernelImpl::VReluKernelImpl(int d) \ - : VReluKernel() { \ - this->num_ = d; \ - this->end_ = d - d % AVX_FLOAT_BLOCK; \ - this->rest_ = d - AVX_FLOAT_BLOCK; \ - } \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 zeros = _mm256_setzero_ps(); \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + this->rest_, tmp); \ + void 
ComputeDeprecated(const T* x, T* y) const override { + VReluRefer(x, y, this->num_); } +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -INTRI16_FLOAT(jit::avx); -INTRI_GT8LT16_FLOAT(jit::avx); -INTRI_GT16_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -INTRI16_FLOAT(jit::avx2); -INTRI_GT8LT16_FLOAT(jit::avx2); -INTRI_GT16_FLOAT(jit::avx2); + private: + std::unique_ptr jitcode_{nullptr}; #endif -#ifdef __AVX512F__ -// TODO(TJ): refine avx512 -INTRI8_FLOAT(jit::avx512f); -INTRI16_FLOAT(jit::avx512f); -INTRI_GT8LT16_FLOAT(jit::avx512f); -INTRI_GT16_FLOAT(jit::avx512f); +}; + +#ifdef PADDLE_WITH_XBYAK +template <> +bool VReluKernelImpl::useJIT(int d) { + return gen::ReluJitCode::init(d); +} #endif -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT -#undef INTRI_GT8LT16_FLOAT -#undef INTRI_GT16_FLOAT +#undef DECLARE_STATIC_FUNC + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } - void Compute(const T* x, T* y) const override {} + void ComputeDeprecated(const T* x, T* y) const override {} }; -REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index c55e54a13f539014c0f582436ca1a105d0b0fedd..2ac9e1092362f60ea3d89da0c971a365b45f39ea 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -35,7 +35,7 @@ template class VExpKernelImpl : public VExpKernel { public: explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } - 
void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { for (int i = 0; i < this->num_; ++i) { y[i] = std::exp(x[i]); } @@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel { }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - platform::dynload::vsExp(this->num_, x, y); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + platform::dynload::vsExp(this->num_, x, y); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VExpKernelImpl::Compute(const double* x, double* y) \ - const { \ - platform::dynload::vdExp(this->num_, x, y); \ +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated( \ + const double* x, double* y) const { \ + platform::dynload::vdExp(this->num_, x, y); \ } FOR_EACH_ISA(MKL_FLOAT, kLT8); FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); @@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) { } // namespace detail -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - _mm256_storeu_ps(y, expisa(tmp)); \ +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, expisa(tmp)); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = expisa(tmp0); \ - tmp1 = expisa(tmp1); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp0 = 
_mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = expisa(tmp0); \ + tmp1 = expisa(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #ifdef __AVX__ @@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel { this->num_ = d; vexp_ = KernelPool::Instance().template Get>(d); } - void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; for (int i = 0; i < this->num_; ++i) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - vexp_->Compute(y, y); + vexp_->ComputeDeprecated(y, y); for (int i = 0; i < this->num_; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); } @@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel { tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, float* y) \ - const { \ - /* TODO(TJ): try to use static const*/ \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_SIGMOID(tmp, min, max, expisa); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ + /* TODO(TJ): try to use static const*/ \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max, expisa); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = 
_mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_SIGMOID(tmp0, min, max, expisa); \ - INTRI_SIGMOID(tmp1, min, max, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max, expisa); \ + INTRI_SIGMOID(tmp1, min, max, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #define INTRI_GT8LT16_FLOAT(isa, expisa) \ @@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel { KernelPool::Instance().template Get>(this->rest_); \ } \ template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 tmp = _mm256_loadu_ps(x); \ @@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ y[i] = 0.f - y[i]; \ } \ - vexp_->Compute(y + this->end_, y + this->end_); \ + vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ for (int i = this->end_; i < this->num_; ++i) { \ y[i] = 1.f / (1.f + y[i]); \ } \ @@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel { KernelPool::Instance().template Get>(this->rest_); \ } \ template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ @@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ y[i] = 0.f - y[i]; \ } \ - vexp_->Compute(y + this->end_, y + this->end_); \ + vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ for (int i = this->end_; i < this->num_; ++i) { \ y[i] = 1.f / (1.f + y[i]); \ } \ @@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel { vsigmoid_ = KernelPool::Instance().template Get>(d); vaddbias_ = KernelPool::Instance().template Get>(d); } - void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { const T a = static_cast(2), b = static_cast(-1); vscal_->Compute(&a, x, y, this->num_); - vsigmoid_->Compute(y, y); + vsigmoid_->ComputeDeprecated(y, y); vscal_->Compute(&a, y, y, this->num_); vaddbias_->Compute(&b, y, y, this->num_); } @@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel { tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa, 
expisa) \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_VTANH(tmp0, expisa); \ - INTRI_VTANH(tmp1, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0, expisa); \ + INTRI_VTANH(tmp1, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #define INTRI_GT8LT16_FLOAT(isa, expisa) \ @@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel { this->rest_); \ } \ template <> \ - void VTanhKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VTanhKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 tmp = _mm256_loadu_ps(x); \ INTRI_VTANH(tmp, expisa); \ _mm256_storeu_ps(y, tmp); \ @@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel { y += AVX_FLOAT_BLOCK; \ const float a = 2.f, b = -1.f; \ vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->Compute(y, y); \ + vsigmoid_->ComputeDeprecated(y, y); \ vscal_->Compute(&a, y, y, this->num_); \ vaddbias_->Compute(&b, y, y, this->num_); \ } -#define INTRI_GT16_FLOAT(isa, expisa) \ - template <> \ - VTanhKernelImpl::VTanhKernelImpl(int d) \ - : VTanhKernel() { \ - this->num_ = d; \ - this->rest_ = d % AVX_FLOAT_BLOCK; \ - this->end_ = d - this->rest_; \ - vscal_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - vsigmoid_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - vaddbias_ = KernelPool::Instance().template 
Get>( \ - this->rest_); \ - } \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - x += this->end_; \ - y += this->end_; \ - const float a = 2.f, b = -1.f; \ - vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->Compute(y, y); \ - vscal_->Compute(&a, y, y, this->num_); \ - vaddbias_->Compute(&b, y, y, this->num_); \ +#define INTRI_GT16_FLOAT(isa, expisa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + const float a = 2.f, b = -1.f; \ + vscal_->Compute(&a, x, y, this->num_); \ + vsigmoid_->ComputeDeprecated(y, y); \ + vscal_->Compute(&a, y, y, this->num_); \ + vaddbias_->Compute(&b, y, y, this->num_); \ } #ifdef __AVX__ diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index ba3e917377cf12192a068a9d71238442e12d5e5e..926221f0a75c461e275a72f16b4339ae28a8e988 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel { void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, T* checked) const override { // gates: W_ch, W_ih, 
W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_); + act_gate_d3_->ComputeDeprecated(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_); - act_cand_d_->Compute(gates, gates); + act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_); - act_cell_d_->Compute(ct, gates + d2_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel { vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_); + act_gate_d2_->ComputeDeprecated(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* get ogated*/ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - 
act_gate_d_->Compute(gates + d3_, gates + d3_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_); - act_cand_d_->Compute(gates, gates); + act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, ct, d_); /* get outgated, put W_oc * C_t on igated */ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_); - act_cell_d_->Compute(ct, gates + d2_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel { } void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates); - act_state_d_->Compute(gates + d2_, gates + d2_); + act_gate_d_->ComputeDeprecated(gates, gates); + act_state_d_->ComputeDeprecated(gates + d2_, gates + d2_); vmul_d_->Compute(gates, gates + d2_, ht, d_); } void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates); + act_gate_d2_->ComputeDeprecated(gates, gates); vmul_d_->Compute(ht_1, gates + d_, ht, d_); } void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { T* y = gates + d2_; - act_state_d_->Compute(y, y); + act_state_d_->ComputeDeprecated(y, y); // out = zt*ht~ + (1-zt)*ht_1 for (int i = 0; i < d_; ++i) { ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; diff --git 
a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 7dc3e600b564d95b46070ff4436b2d0de2f3e105..5e1f91ffae03796be2817d0461900c2512938c77 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) { #endif auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d @@ -181,7 +181,7 @@ TEST(JitKernel, vexp) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -222,7 +222,7 @@ void vsigmoid_better( y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = 0.f - y[i]; } - vexp->Compute(y, y); + vexp->ComputeDeprecated(y, y); for (int i = 0; i < n; ++i) { y[i] = 1.f / (1.f + y[i]); } @@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -287,7 +287,7 @@ void vtanh_better( const int n, const float* x, float* y) { const float a = 2.f, b = -1.f; vscal->Compute(&a, x, y, n); - vsigmoid->Compute(y, y); + vsigmoid->ComputeDeprecated(y, y); vscal->Compute(&a, y, y, n); vaddbias->Compute(&b, y, y, n); } @@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -344,8 +344,8 @@ void lstm_ctht_ref( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, 
gates + d); - vtanh_d->Compute(gates, gates); + vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); + vtanh_d->ComputeDeprecated(gates, gates); const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -355,7 +355,7 @@ void lstm_ctht_ref( // H_t = act_cell(C_t) * ogated float tmp = ct[k] * 2; tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp); + vexp_1->ComputeDeprecated(&tmp, &tmp); tmp = 2.f / (1.f + tmp) - 1.f; ht[k] = tmp * o[k]; } @@ -373,13 +373,13 @@ void lstm_ctht_better( const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, const int d, float* gates, const float* ct_1, float* ct, float* ht) { int d2 = d * 2; - vsigmoid_3d->Compute(gates + d, gates + d); - vtanh_d->Compute(gates, gates); + vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); + vtanh_d->ComputeDeprecated(gates, gates); vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vadd_d->Compute(gates + d, gates + d2, ct, d); /* H_t = act_cell(C_t) * ogated */ - vtanh_d->Compute(ct, gates + d2); + vtanh_d->ComputeDeprecated(ct, gates + d2); vmul_d->Compute(gates + d2, gates + d * 3, ht, d); } @@ -736,7 +736,7 @@ void vaddrelu_better( const paddle::operators::math::jitkernel::VReluKernel>& vrelu, const float* x, const float* y, float* z, int d) { vadd->Compute(x, y, z, d); - vrelu->Compute(z, z); + vrelu->ComputeDeprecated(z, z); } TEST(JitKernel, vaddrelu) { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 9577a4cb9d275df9604b7578f8685e4d2938a5e9..5978c1d6056001142854583840b8bfcb54d475d1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -244,7 +244,7 @@ typename std::enable_if< std::is_same::value>::type elementwise_add_to(const DeviceContext& ctx, BlasT* 
blas, size_t data_len, const T* in, T* out) { - for (int64_t i = 0; i < data_len; i++) { + for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 2bc008dd34ffcfe93a00bd4a8cde61626d91e235..5535523e798912ff80eeb5d753914c7d8d70a05f 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { - for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { + for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = in_grad.Slice(begin, end); - for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], out_grad.data()[m + i * second_dim]); @@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { } } } else { - for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { + for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { int64_t begin = cpu_in_grad.lod()[0][i]; int64_t end = cpu_in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); - for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], cpu_out_grad.data()[m + i * second_dim]); diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index 78c65af24a8c5fa57e33415acc3018790bf70790..fa2018178f44ff4e3b14937c1f508fa8a698e20e 100644 --- 
a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -19,8 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index ce183ed3649055aab31eb6e3f44f2224475957e9..2e9669049e36478549b793e3fa76220825888e21 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor +template class SoftmaxFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor* X, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index dd9971ba091cc3ece86654f65c335b98087f45ed..7cf98f27251db3cfe5e8e295ed21056f6e5a2963 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()(const DeviceContext& context, - const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()( + const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = 
EigenMatrix::From(*Y); @@ -65,6 +65,39 @@ void SoftmaxFunctor::operator()(const DeviceContext& context, .broadcast(one_by_class)); } +template +class SoftmaxFunctor { + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const framework::Tensor* y, diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h index fef9e023d02f45e21ec409ad398ba7d9bdd36880..99c57590191d58a12760fb335df76037685d1ced 100644 --- a/paddle/fluid/operators/merge_ids_op.h +++ b/paddle/fluid/operators/merge_ids_op.h @@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - int row_ids_size = 0; + size_t row_ids_size = 0; int row_size = 0; int embedding_size = 0; - for (int i = 0; i < x_tensors.size(); ++i) { + for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; const auto *row_id = row_ids[i]; @@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel { std::unordered_map> selected_rows_idx_map; - for (int i = 0; i < x_tensors.size(); ++i) { + for (size_t i = 
0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; for (int j = 0; j < row_id->numel(); ++j) { @@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(), "the rows and tensor map size should be the same"); - for (int i = 0; i < outs.size(); ++i) { + for (size_t i = 0; i < outs.size(); ++i) { auto *out_ids = ids[i]; auto *out = outs[i]; diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 08f2949d4a3774894912ae5251806b46e6240702..7e434c293c9631025a5a725d62838fa12e845838 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -56,7 +56,8 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's " - "height. %s, %s"); + "height. %s, %s", + x_mat_dims[1], y_mat_dims[0]); std::vector output_dims; output_dims.reserve( static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index e471f04662a1fa3e8e77a2db37f0da4521682018..877c9a0528441a7d5b1306c3f8f8be1a5aea577a 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -69,7 +69,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + platform::CPUPlace()); } }; @@ -174,7 +174,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 
37646c7b4c50fc7409002aca56e5462bde93cc30..685ebc393794337e03de0b9ce5134d6ce382c8bf 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -74,7 +74,7 @@ PadConstantLikeOp Operator. Pad input(Y) with a pad_value, the number of values padded to the edges of each axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for each axis. The input should be a k-D tensor(k > 0 and k < 7). As an example: diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu index 0d86b3127e42f7ee14ba57b1c762e8128a0f2d54..b21da178f3eeaafa41bde5f64cc4abcf7944b032 100644 --- a/paddle/fluid/operators/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max, int, ops::MaxFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..6954c8d744faee6f8f0b715d6e4c8e3bcda7fb83 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b30244839849d79e3e531953134633503c4090..4408200d2d052c2f68c2dd35619de6ed67f07f6e 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_mean_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b663bcdca7c20f8802d962a362f429d8eafe9af --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu index da466f805eff4709dc23471baef03e94052ee6c1..5a04a12b79444dcea30d3c1140d9708a98b55fe3 100644 --- a/paddle/fluid/operators/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min, int, ops::MinFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..5b8f061b2d03eb76863401905ac87044fd5ea778 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu index d62e677d92cffecf629d1684026b0c7bcfec29e3..d8692afb96e4d5d3206210060684dd12fb4d79a7 100644 --- a/paddle/fluid/operators/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod, int, ops::ProdFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..486c578c64b9a2d80abc940a7c4266ef5fd23c7f --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..2b031e8df99768c9208146640bddbe51149b2614 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..525633f62a95b2d0d677fcbebe551b75cb2a180d --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/cub_reduce.h" +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/ref_by_trainer_id_op.h b/paddle/fluid/operators/ref_by_trainer_id_op.h index 2ce577544ae2437b9297da2190fd09b435d5173c..34192278d84758d720e021215c14a54349ba0c62 100644 --- a/paddle/fluid/operators/ref_by_trainer_id_op.h +++ b/paddle/fluid/operators/ref_by_trainer_id_op.h @@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel { } else { trainer_id = *trainer_id_data; } - PADDLE_ENFORCE_LT(trainer_id, in_list.size()); + PADDLE_ENFORCE_LT((size_t)trainer_id, in_list.size()); out->mutable_data(context.GetPlace()); out->ShareDataWith(*(in_list[trainer_id])); } diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8e29761ec208764e263e357a0b3c9456c932d093..043ea680d1506e7b7e33ba5537a71f37feaf81be 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "Argmaxes corresponding to indices in X used " "for gradient computation. 
Only output " - "if arg “is_test” is false.") + "if arg \"is_test\" is false.") .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ac7d69bfb549fd98c76fcf834e8d3ad9bec2ef23..b2e79f6c82bb748293f4219845e6798347c8c46e 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 39af717615c01f5c121e32b176b74d05be738531..8bae6606c94620ab4fa8ae34f69236e7e87e9670 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 2e206c963ea009b436bb03433d30683a29fe83aa..b27ef27e298d0f08129e2c0a349c741129acdfe2 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -109,8 +109,6 @@ class SGDOpKernel : public framework::OpKernel { const auto *grad_data = grad.value().data(); auto *out_data = param_out->mutable_value()->data(); for (size_t i = 0; i < grad.rows().size(); i++) { - 
PADDLE_ENFORCE(grad.rows()[i] < grad.height(), - "Input rows index should less than height"); int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); PADDLE_ENFORCE_GE(id_index, static_cast(0), "id should be in the table"); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index cf1eeb017d666f605a431aa54637d8cbc99c7c46..2fea8a65bc5141b11549ef400f11b54278be35f9 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - math::SoftmaxFunctor()( +#ifdef ON_INFER + math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); +#else + math::SoftmaxFunctor()( + context.template device_context(), &X_2d, &Out_2d); +#endif } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index e9aba3b37b8cc01d4fe5de5200579d4e93f67e56..c0530e3d8bc407ddd6d7bf6e10a715185d0beb1f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SoftmaxFunctor()(dev_ctx, logits, - softmax); + math::SoftmaxFunctor()( + dev_ctx, logits, softmax); math::CrossEntropyFunctor()( dev_ctx, loss, softmax, labels, context.Attr("soft_label"), context.Attr("ignore_index")); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index 6dbada3da8826f0e7cb07a9642d327e5ee38c309..f5d6d85d7d75507f82de212812ecee0a650d3aad 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel { out_ids.resize(outs.size()); // 
split id by their shard_num. - for (int i = 0; i < all_ids.size(); ++i) { + for (size_t i = 0; i < all_ids.size(); ++i) { T id = all_ids[i]; size_t shard_id = static_cast(id) % shard_num; out_ids[shard_id].push_back(id); diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 3f4b48bc7391def082c82ed451fc5a752009a2f1..9345b495415d203728238c19621a20f446c40bf5 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CPU_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 92c1bde2bcf089e5c715e90e564408e6ad37ba17..bf2a9e5b3d22996e688621727cb280dc9aed7859 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -18,8 +18,12 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 1d441b43b14ea194152095874645f8133c423efd..6d2ccb38f677962d52ff97df25321bf195759dcd 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: $(N, C_{out}, H_{out}, W_{out})$, where $$ 
-H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ -W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] $$ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf )DOC"); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index aa20553ceffceded09447693c6e92f55fb48702d..9273e9b1e72f0ad7abd6c20d4a34283fbe24378a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -76,6 +76,10 @@ extern void* mklml_dso_handle; __macro(vdMul); \ __macro(vsExp); \ __macro(vdExp); \ + __macro(vsSqr); \ + __macro(vdSqr); \ + __macro(vsPowx); \ + __macro(vdPowx); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0f3298f6006ade66ccacff4f33338ba3b6b73ebb..9f7aa556988e8ab0ca87d0e7212fe27a209f6a32 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -115,6 +115,14 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); + +// windows has no support for openblas multi-thread +#ifdef _WIN32 + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } +#endif + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif @@ -170,7 +178,9 @@ void InitGLOG(const std::string &prog_name) { // glog will not hold the ARGV[0] inside. // Use strdup to alloc a new string. 
google::InitGoogleLogging(strdup(prog_name.c_str())); +#ifndef _WIN32 google::InstallFailureSignalHandler(); +#endif } } // namespace framework diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 40af1f95208905231b933e5184a807b061164799..a6360a884d74f06603f28efeb36e39fbd0257cf6 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _WIN32 #pragma once #include @@ -149,3 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle +#endif diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index cf9f4aa95bc1cb79d95b79331fbc09e11af64194..8823e97b0b696556b32724acd096e8fc79a49f53 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -24,21 +24,16 @@ #include "glog/logging.h" #if !defined(_WIN32) -#define UNUSED __attribute__((unused)) #include // dladdr #include // backtrace #include #include // std::accumulate #else #include // _popen, _pclose +#include #include -#if defined(_WIN32) #include // std::accumulate in msvc -#endif -// windows version of __attribute__((unused)) -#define UNUSED __pragma(warning(suppress : 4100)) - -#ifndef S_ISDIR // windows port for sys/stat.h +#ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #endif // S_ISDIR diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 86c5f87f34d648d4a784eba2014a4f896d74724b..e9aef621acea44b0dab7a687c13223617d5603c0 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -43,3 +43,11 @@ limitations under the License. 
*/ #include #include #include + +// some platform-independent defintion +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e7f634c4a622b48e97040987836406cf73cb23b6..6afa53cd36dee2004ada707a52995ce33b3650e9 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,9 +2,9 @@ set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) -list(APPEND PYBIND_DEPS parallel_executor profiler) -list(APPEND PYBIND_SRCS recordio.cc) -endif() + list(APPEND PYBIND_DEPS parallel_executor profiler) + list(APPEND PYBIND_SRCS recordio.cc) +endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED @@ -21,5 +21,13 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) + if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(paddle_pybind ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_pybind shlwapi) + endif(WIN32) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 806b304be593463690063e9d454083ed7fc88b98..8ff6f6c85ace4bdfb14a2e9c82b1e07d01fc0f4c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,13 @@ limitations under the License. 
*/ #include #include +#if defined(_WIN32) +#define NOMINMAX +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL +#include +#endif + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -29,7 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" +#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" +#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -51,7 +60,9 @@ limitations under the License. */ #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif #include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -342,22 +353,25 @@ All parameter, weight, gradient are variables in Paddle. .def("get_lod_tensor_array", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, py::return_value_policy::reference) -#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference); + py::return_value_policy::reference) +#endif + ; +#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); +#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -482,7 +496,7 @@ All parameter, weight, gradient are variables in Paddle. 
#endif });; // clang-format on -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") @@ -619,11 +633,14 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); +#ifndef _WIN32 m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); #endif +#endif +#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -644,6 +661,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); +#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -652,9 +670,9 @@ All parameter, weight, gradient are variables in Paddle. [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }); + .def("set_int", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("type", &ir::Pass::Type); py::class_> pb( m, "PassBuilder"); @@ -672,6 +690,7 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -793,6 +812,7 @@ All parameter, weight, gradient are variables in Paddle. 
"reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.reduce_ = strategy; }, R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, @@ -806,6 +826,7 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.gradient_scale_ = strategy; }, R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in @@ -817,6 +838,7 @@ All parameter, weight, gradient are variables in Paddle. "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.debug_graphviz_path_ = path; }, R"DOC(The type is STR, debug_graphviz_path indicate the path that @@ -826,6 +848,7 @@ All parameter, weight, gradient are variables in Paddle. "enable_data_balance", [](const BuildStrategy &self) { return self.enable_data_balance_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.enable_data_balance_ = b; }) // FIXME(chengudo): enable_data_balance seems not important .def_property( @@ -834,6 +857,7 @@ All parameter, weight, gradient are variables in Paddle. return self.enable_sequential_execution_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.enable_sequential_execution_ = b; }, R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC") @@ -843,6 +867,7 @@ All parameter, weight, gradient are variables in Paddle. 
return self.remove_unnecessary_lock_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") @@ -852,15 +877,19 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_elewise_add_act_ops_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.fuse_elewise_add_act_ops_ = b; }, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") - .def("_create_passes_from_strategy", + .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(); - }); + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); pe.def(py::init &, const std::unordered_set &, @@ -889,6 +918,7 @@ All parameter, weight, gradient are variables in Paddle. }); BindRecordIOWriter(&m); +#endif return m.ptr(); } } // namespace pybind diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 47de23377398423dabf3b0ed5b670e564f57cdfb..a2eec6e3c48dd126614bbff0227145537b678ac4 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -72,6 +72,7 @@ #include #include #include +#include #include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat @@ -102,5 +103,22 @@ void Printf(const char* fmt, const Args&... 
args) { Fprintf(std::cout, fmt, args...); } +template +std::string HumanReadableSize(T size) { + size_t i = 0; + double f_size = static_cast(size); + double orig = f_size; + const std::vector units( + {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); + while (f_size > 1024) { + f_size /= 1024; + i++; + } + if (i >= units.size()) { + return Sprintf("%fB", orig); + } + return Sprintf("%f%s", f_size, units[i]); +} + } // namespace string } // namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 595af0c29a5c2c30427a82205d3d56c49492c0c8..32f9bca645d80a11274d128b6615a73ffa224705 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -139,6 +139,7 @@ function cmake_gen() { -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} @@ -155,6 +156,8 @@ function cmake_gen() { -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== @@ -171,6 +174,7 @@ EOF -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ @@ -186,6 +190,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ + 
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} @@ -775,6 +781,17 @@ function main() { test_fluid_lib assert_api_spec_approvals ;; + assert_api) + assert_api_not_changed ${PYTHON_ABI:-""} + ;; + test_inference) + gen_capi_package + gen_fluid_lib + test_fluid_lib + ;; + assert_api_approvals) + assert_api_spec_approvals + ;; maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0d29f2ad209296688582924ae16e495930830bd4..139176b0d6c5dff511a97c9ac01f09e72a90306b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,23 +45,42 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - -set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) -add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) +IF(WIN32) + # Python would use the .pyd by default under Windows series platform + set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) +ELSE() + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) +ENDIF() add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) - -add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND touch stub.cc - COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python - COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py 
bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +IF(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ELSE(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND touch stub.cc + COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python + COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ENDIF() set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp 
${MKL_DEPENDS}) if(NOT WITH_FLUID_ONLY) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 78d79317aa68d71059a0603f22e44de45a02314c..b99197492870e9886e8a29f8c46723401a2a5ce1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os # import all class inside framework into fluid module from . import framework from .framework import * @@ -111,12 +112,16 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', + 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode' ] + if os.name != 'nt': + read_env_flags.append('warpctc_dir') + read_env_flags.append('cpu_deterministic') + if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_path') diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffeadca0a7b103682f175d50dc46fa258a..b966ae01d039d7e9510dae73ecadb97b494f68c2 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,13 +15,15 @@ from __future__ import print_function import contextlib +import os from .. import core from .. import executor from .. import framework from .. import io -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from .. 
import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f91786b5562e84dcdccf6d91da0612cc..096821a5ba690074ecbf023cf87fed7e206d023f 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,7 +28,8 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4ac94981a7a47530fe6ae4d968212c62dd3e0a93..96b6705e26c0f8d8d223e9020192a8f330c2c727 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -31,6 +31,7 @@ from functools import reduce __all__ = [ 'prior_box', + 'density_prior_box', 'multi_box_head', 'bipartite_match', 'target_assign', @@ -1023,6 +1024,135 @@ def prior_box(input, return box, var +def density_prior_box(input, + image, + densities=None, + fixed_sizes=None, + fixed_ratios=None, + variance=[0.1, 0.1, 0.2, 0.2], + clip=False, + steps=[0.0, 0.0], + offset=0.5, + name=None): + """ + **Density Prior Box Operator** + + Generate density prior boxes for SSD(Single Shot MultiBox Detector) + algorithm. Each position of the input produce N prior boxes, N is + determined by the count of densities, fixed_sizes and fixed_ratios. + Boxes center at grid points around each input position is generated by + this operator, and the grid points is determined by densities and + the count of density prior box is determined by fixed_sizes and fixed_ratios. + Obviously, the number of fixed_sizes is equal to the number of densities. 
+ For densities_i in densities: + N_density_prior_box =sum(N_fixed_ratios * densities_i^2), + + Args: + input(Variable): The Input Variables, the format is NCHW. + image(Variable): The input image data of PriorBoxOp, + the layout is NCHW. + densities(list|tuple|None): the densities of generated density prior + boxes, this attribute should be a list or tuple of integers. + Default: None. + fixed_sizes(list|tuple|None): the fixed sizes of generated density + prior boxes, this attribute should a list or tuple of same + length with :attr:`densities`. Default: None. + fixed_ratios(list|tuple|None): the fixed ratios of generated density + prior boxes, if this attribute is not set and :attr:`densities` + and :attr:`fix_sizes` is set, :attr:`aspect_ratios` will be used + to generate density prior boxes. + variance(list|tuple): the variances to be encoded in density prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + clip(bool): Whether to clip out-of-boundary boxes. Default: False. + step(list|turple): Prior boxes step across width and height, If + step[0] == 0.0/step[1] == 0.0, the density prior boxes step across + height/weight of the input will be automatically calculated. + Default: [0., 0.] + offset(float): Prior boxes center offset. Default: 0.5 + name(str): Name of the density prior box op. Default: None. + + Returns: + tuple: A tuple with two Variable (boxes, variances) + + boxes: the output density prior boxes of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total + box count of each position of input. + + variances: the expanded variances of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total + box count of each position of input + + + Examples: + .. 
code-block:: python + + box, var = fluid.layers.density_prior_box( + input=conv1, + image=images, + min_sizes=[100.], + max_sizes=[200.], + aspect_ratios=[1.0, 1.0 / 2.0, 2.0], + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0, 3.0, 1.0 / 3.0], + flip=True, + clip=True) + """ + helper = LayerHelper("density_prior_box", **locals()) + dtype = helper.input_dtype() + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(densities): + raise TypeError('densities should be a list or a tuple or None.') + if not _is_list_or_tuple_(fixed_sizes): + raise TypeError('fixed_sizes should be a list or a tuple or None.') + if not _is_list_or_tuple_(fixed_ratios): + raise TypeError('fixed_ratios should be a list or a tuple or None.') + if len(densities) != len(fixed_sizes): + raise ValueError('densities and fixed_sizes length should be euqal.') + if not (_is_list_or_tuple_(steps) and len(steps) == 2): + raise ValueError('steps should be a list or tuple ', + 'with length 2, (step_width, step_height).') + + densities = list(map(int, densities)) + fixed_sizes = list(map(float, fixed_sizes)) + fixed_ratios = list(map(float, fixed_ratios)) + steps = list(map(float, steps)) + + attrs = { + 'variances': variance, + 'clip': clip, + 'step_w': steps[0], + 'step_h': steps[1], + 'offset': offset, + } + if densities is not None and len(densities) > 0: + attrs['densities'] = densities + if fixed_sizes is not None and len(fixed_sizes) > 0: + attrs['fixed_sizes'] = fixed_sizes + if fixed_ratios is not None and len(fixed_ratios) > 0: + attrs['fixed_ratios'] = fixed_ratios + + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="density_prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + 
def multi_box_head(inputs, image, base_size, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 1ab48c00548b58f4b3e411d8e46e8cf496d6b891..a9075045a2d5282ecded1681bc9835feb15298ea 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -15,6 +15,7 @@ from __future__ import print_function import contextlib import multiprocessing +import os import six import threading @@ -346,70 +347,72 @@ def _copy_reader_create_op_(block, op): return new_op -@templatedoc(op_type='create_recordio_file_reader') -def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] +if os.name != 'nt': + + @templatedoc(op_type='create_recordio_file_reader') + def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} + + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declaring data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declaring data type. 
+ pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + ${out_comment}. + + Examples: + + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) - var_name = unique_name('open_recordio_file') + var_name = unique_name('open_recordio_file') - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + if pass_num > 1: + main_prog_var = 
multi_pass(reader=main_prog_var, pass_num=pass_num) - return monkey_patch_reader_methods(main_prog_var) + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c1a93e831b6df02c2c4cfa34cd498b7b147a8b1b..1b5009e76126e8b0fbe2805cc85f4f133a70c1ae 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,6 +18,7 @@ All layers just related to the neural network. from __future__ import print_function import numpy as np +import os from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder @@ -165,6 +166,7 @@ __all__ = [ 'grid_sampler', 'log_loss', 'add_position_encoding', + 'bilinear_tensor_product', ] @@ -340,126 +342,128 @@ def embedding(input, return tmp -@templatedoc(op_type="lstm") -def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} +if os.name != 'nt': - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. 
- - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." 
- helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + @templatedoc(op_type="lstm") + def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. + + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). 
+ + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." 
+ helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + helper.append_op( + type='lstm', + inputs=inputs, + 
outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -958,39 +962,43 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -@templatedoc() -def crf_decoding(input, param_attr, label=None): - """ - ${comment} +if os.name != 'nt': - Args: - input(${emission_type}): ${emission_comment} + @templatedoc() + def crf_decoding(input, param_attr, label=None): + """ + ${comment} - param_attr(ParamAttr): The parameter attribute for training. + Args: + input(${emission_type}): ${emission_comment} - label(${label_type}): ${label_comment} + param_attr(ParamAttr): The parameter attribute for training. - Returns: - Variable: ${viterbi_path_comment} + label(${label_type}): ${label_comment} - Examples: - .. code-block:: python + Returns: + Variable: ${viterbi_path_comment} - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={"Emission": [input], + Examples: + .. 
code-block:: python + + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={ + "Emission": [input], "Transition": transition, - "Label": label}, - outputs={"ViterbiPath": [viterbi_path]}) + "Label": label + }, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -4066,8 +4074,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[8], dtype='float32') - y = fluid.layers.data(name='y', shape=[7], dtype='float32') + x = fluid.layers.data(name='x', shape=[1], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.edit_distance(input=x,label=y) """ helper = LayerHelper("edit_distance", **locals()) @@ -4741,7 +4749,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False, ignore_index=-100, - numeric_stable_mode=False): + numeric_stable_mode=False, + return_softmax=False): """ **Softmax With Cross Entropy Operator.** @@ -4805,9 +4814,15 @@ def softmax_with_cross_entropy(logits, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: False + return_softmax (bool): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False Returns: - Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. + Variable or Tuple of two Variables: Return the cross entropy loss if + `return_softmax` is False, otherwise the tuple + (loss, softmax), where the cross entropy loss is + a 2-D tensor with shape [N x 1], and softmax is a + 2-D tensor with shape [N x K]. Examples: .. 
code-block:: python @@ -4832,6 +4847,10 @@ def softmax_with_cross_entropy(logits, 'ignore_index': ignore_index, 'numeric_stable_mode': numeric_stable_mode }) + + if return_softmax: + return loss, softmax + return loss @@ -5526,42 +5545,48 @@ def label_smooth(label, return smooth_label -@templatedoc() -def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): - """ - ${comment} - - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - - Returns: - Variable: ${out_comment}. - - Examples: - .. code-block:: python - - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out +if os.name != 'nt': + + @templatedoc() + def roi_pool(input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. + + Examples: + .. 
code-block:: python + + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() @@ -6810,7 +6835,7 @@ def prelu(x, mode, param_attr=None, name=None): alpha_shape = x.shape dtype = helper.input_dtype(input_param_name='x') alpha = helper.create_parameter( - attr=param_attr, + attr=helper.param_attr, shape=alpha_shape, dtype='float32', is_bias=False, @@ -8289,3 +8314,72 @@ def add_position_encoding(input, alpha, beta, name=None): attrs={"alpha": alpha, "beta": beta}) return out + + +def bilinear_tensor_product(x, + y, + size, + act=None, + name=None, + param_attr=None, + bias_attr=None): + """ + **Add Bilinear Tensor Product Layer** + + This layer performs bilinear tensor product on two inputs. + For example: + + .. math:: + out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + + In this formula: + - :math:`x`: the first input contains M elements, shape is [batch_size, M]. + - :math:`y`: the second input contains N elements, shape is [batch_size, N]. + - :math:`W_{i}`: the i-th learned weight, shape is [M, N] + - :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. + + Args: + x (Variable): 2-D input tensor with shape [batch_size, M] + y (Variable): 2-D input tensor with shape [batch_size, N] + size (int): The dimension of this layer. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. 
+ param_attr (ParamAttr, default None): The parameter attribute for the learnable w. + parameters/weights of this layer. + bias_attr (ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + + Returns: + Variable: A 2-D Tensor of shape [batch_size, size]. + + Examples: + .. code-block:: python + + tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000) + """ + helper = LayerHelper('bilinear_tensor_product', **locals()) + dtype = helper.input_dtype('x') + + param_shape = [size, x.shape[1], y.shape[1]] + + w = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False) + + if name is None: + out = helper.create_variable_for_type_inference(dtype=dtype) + else: + out = helper.create_variable(name=name, dtype=dtype, persistable=False) + + inputs = {"X": x, "Y": y, "Weight": w} + if helper.bias_attr: + bias_size = [1, size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + inputs["Bias"] = bias + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}) + + # add activation + return helper.append_activation(out) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1ff40a26f2f24e2ff06719972489b0c1e5d140c3..66eb1229aa3ec7a956146f12da2889d59b88671a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .. 
import core from ..framework import convert_np_dtype_to_dtype_ @@ -99,27 +100,26 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -__all__ += ['cumsum'] - -_cum_sum_ = generate_layer_fn('cumsum') - - -def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - -cumsum.__doc__ = _cum_sum_.__doc__ + """ -Examples: - - >>> data = fluid.layers.data(name="input", shape=[32, 784]) - >>> result = fluid.layers.cumsum(data, axis=0) -""" +if os.name != 'nt': + __all__ += ['cumsum'] + + _cum_sum_ = generate_layer_fn('cumsum') + + def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) + + cumsum.__doc__ = _cum_sum_.__doc__ + """ + Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) + """ __all__ += ['thresholded_relu'] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 57e5d197b618615b32a7f446df0a81e18c25b097..ff32c00104171bf42c00be33f05758a4387228e1 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -235,11 +235,11 @@ def tensor_array_to_tensor(input, axis=1, name=None): output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array) """ - helper = LayerHelper('tensor_array_concat', **locals()) + helper = LayerHelper('tensor_array_to_tensor', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) out_index = helper.create_variable_for_type_inference(dtype="int32") helper.append_op( - type='tensor_array_concat', + type='tensor_array_to_tensor', inputs={'X': input}, outputs={'Out': [out], 'OutIndex': 
[out_index]}, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 28dc7519571d8b5464e92fddf634ba46691ceaa9..982d29180141d052e25ea3dcba6e3e7ce4181c48 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -128,6 +128,24 @@ class TestPriorBox(unittest.TestCase): assert box.shape[3] == 4 +class TestDensityPriorBox(unittest.TestCase): + def test_density_prior_box(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 + + class TestAnchorGenerator(unittest.TestCase): def test_anchor_generator(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index edc60550058f53da456c21de4b41142b907743df..cf62817956c12cd4487eba88bf49ed43331dff03 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -26,6 +26,7 @@ from multiprocessing import Process from functools import reduce import numpy as np +import pickle import unittest import six @@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): io.save_persistables(startup_exe, model_dir, trainer_prog) var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - print(np.ravel(var).tolist()) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py 
b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py new file mode 100644 index 0000000000000000000000000000000000000000..79d1fd3d7171e06a88a75cf50b6a51ef4da51f07 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestDensityPriorBoxOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'variances': self.variances, + 'clip': self.clip, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset, + 'densities': self.densities, + 'fixed_sizes': self.fixed_sizes, + 'fixed_ratios': self.fixed_ratios + } + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "density_prior_box" + self.set_data() + + def set_density(self): + self.densities = [] + self.fixed_sizes = [] + self.fixed_ratios = [] + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / 
float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.set_density() + + self.clip = True + self.num_priors = 0 + if len(self.fixed_sizes) > 0 and len(self.densities) > 0: + for density in self.densities: + if len(self.fixed_ratios) > 0: + self.num_priors += len(self.fixed_ratios) * (pow(density, + 2)) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype('float32') + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype('float32') + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype('float32') + out_var = np.zeros(out_dim).astype('float32') + + step_average = int((self.step_w + self.step_h) * 0.5) + for h in range(self.layer_h): + for w in range(self.layer_w): + idx = 0 + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + # Generate density prior boxes with fixed size + for density, fixed_size in zip(self.densities, + self.fixed_sizes): + if (len(self.fixed_ratios) > 0): + for ar in self.fixed_ratios: + shift = int(step_average / density) + box_width_ratio = fixed_size * math.sqrt(ar) + box_height_ratio = fixed_size / math.sqrt(ar) + for di in range(density): + for dj in range(density): + c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift + c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift + out_boxes[h, w, idx, :] = [ + max((c_x_temp - box_width_ratio / 2.0) / + self.image_w, 0), + max((c_y_temp - box_height_ratio / 2.0) + / self.image_h, 0), + min((c_x_temp + box_width_ratio / 2.0) / + self.image_w, 1), + min((c_y_temp + box_height_ratio / 2.0) + / self.image_h, 1) + ] + idx += 1 + if self.clip: + out_boxes 
= np.clip(out_boxes, 0.0, 1.0) + out_var = np.tile(self.variances, + (self.layer_h, self.layer_w, self.num_priors, 1)) + self.out_boxes = out_boxes.astype('float32') + self.out_var = out_var.astype('float32') + + +class TestDensityPriorBox(TestDensityPriorBoxOp): + def set_density(self): + self.densities = [3, 4] + self.fixed_sizes = [1.0, 2.0] + self.fixed_ratios = [1.0] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4b8a215190a90c974a9ecc8658d044c59b80c989..97e7ee6229f081ff67ca3e2aedcad0a2e3d9cabf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -105,7 +105,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if args.batch_merge_repeat > 1: - pass_builder = build_stra._create_passes_from_strategy() + pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 03066fee48b703f8b55bd4ae6a9c4bb8deecab1e..ea2b554dac83988955e3a7e8919e57a4ed7a8215 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase): shutil.rmtree(model_dir) - local_np = np.array(eval(local_var[0])) - train0_np = np.array(eval(tr0_var[0])) - train1_np = np.array(eval(tr1_var[0])) + local_np = np.array(local_var) + train0_np = np.array(tr0_var) + train1_np = np.array(tr1_var) + self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta) self.assertAlmostEqual(local_np.all(), train1_np.all(), 
delta=delta) self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta) - @unittest.skip(reason="CI fail") def test_dist(self): need_envs = { "IS_DISTRIBUTED": '0', diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 692f2375f2119c3def1751d92087613cd965bf25..a8fa5436c43d2f05f632b920f67d43d837d28da9 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -369,6 +369,10 @@ class TestBook(unittest.TestCase): with program_guard(program): x = layers.data(name='x', shape=[16], dtype='float32') y = layers.data(name='label', shape=[1], dtype='int64') + loss, softmax = layers.softmax_with_cross_entropy( + x, y, return_softmax=True) + self.assertIsNotNone(loss) + self.assertIsNotNone(softmax) loss = layers.softmax_with_cross_entropy(x, y) self.assertIsNotNone(loss) print(str(program)) @@ -911,6 +915,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_1) print(str(program)) + def test_bilinear_tensor_product_layer(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[4], dtype="float32") + + theta = layers.data(name="theta", shape=[5], dtype="float32") + out = layers.bilinear_tensor_product(data, theta, 6) + + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py index 11e5d8b536fb65b66c954991bf815241774702ec..c7f4f3e913bfd66cbbb703c0e73336f9a3563507 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py @@ -80,6 +80,33 @@ class TestLookupSpraseTable(OpTest): assert (result_array2[3] == w_array[6]).all() assert (result_array2[4] == w_array[7]).all() + # create and run lookup_table operator + test_lookup_table = 
Operator( + "lookup_sparse_table", + W='W', + Ids='Ids', + Out='Out', + min=-5.0, + max=10.0, + seed=10, + is_test=True) + + ids = scope.var("Ids").get_tensor() + unknown_id = [44, 22, 33] + ids_array2 = np.array([4, 2, 3, 7, 100000] + unknown_id).astype("int64") + ids.set(ids_array2, place) + test_lookup_table.run(scope, place) + + result_array2 = np.array(out_tensor) + assert (result_array2[0] == w_array[5]).all() + assert (result_array2[1] == w_array[1]).all() + assert (result_array2[2] == w_array[2]).all() + assert (result_array2[3] == w_array[6]).all() + assert (result_array2[4] == w_array[7]).all() + + for i in [5, 6, 7]: + assert np.all(result_array2[i] == 0) + def test_w_is_selected_rows(self): places = [core.CPUPlace()] # currently only support CPU diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index c93740669f40aee3a6c143d153cfd0f5bb72dbd9..18d95c94ad36316b7149eb5412260b40a57ac002 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -21,8 +21,8 @@ import six class TestBase(unittest.TestCase): def main(self, network_func, - iter=100, - iter_per_pe=100, + iter=10, + iter_per_pe=10, use_gpu=True, use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): @@ -45,7 +45,7 @@ class TestBase(unittest.TestCase): exe_strategy._dry_run = True exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_gpu, loss_name=loss.name, main_program=main_prog, exec_strategy=exe_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 288c5f6a1f6b1760ca40c0c653e4c0726b799519..5a3ec8ff0180281babeaa006133b3ff9dc6d8083 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() - pass_builder = build_strategy._create_passes_from_strategy() + self.assertFalse(build_strategy.fuse_elewise_add_act_ops) + build_strategy.fuse_elewise_add_act_ops = True + pass_builder = build_strategy._finalize_strategy_and_create_passes() + self.assertTrue("fuse_elewise_add_act_pass" in + [p.type() for p in pass_builder.all_passes()]) + origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") diff --git a/python/setup.py.in b/python/setup.py.in index b1ff9f3a5c3d877edb6bc6a12efce053a44b4c9c..26f3c5aeaa17f60ccdac849e589c779a73138088 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,7 @@ class BinaryDistribution(Distribution): RC = 0 - +ext_name = '.dll' if os.name == 'nt' else '.so' def git_commit(): try: @@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] -package_data={'paddle.fluid': ['core.so']} +package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} +if os.name == 'nt': + package_data['paddle.fluid'] += ['openblas' + ext_name] + if '${WITH_FLUID_ONLY}'== 'OFF': - package_data['paddle.v2.master']=['libpaddle_master.so'] - package_data['py_paddle']=['*.py','_swig_paddle.so'] + package_data['paddle.v2.master']=['libpaddle_master' + ext_name] + package_data['py_paddle']=['*.py','_swig_paddle' + + ext_name] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' # put all thirdparty libraries in paddle.libs -package_data['paddle.libs']=['libwarpctc.so'] 
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] + package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] if '${CMAKE_BUILD_TYPE}' == 'Release': # only change rpath in Release mode. if '${WITH_MKLDNN}' == 'ON': @@ -174,37 +179,60 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': raise Exception("patch libmkldnn.so failed, command: %s" % command) package_data['paddle.libs']+=['libmkldnn.so.0'] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${WITH_NGRAPH}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) + shutil.copy('${NGRAPH_CPU_LIB}', libs_path) + shutil.copy('${NGRAPH_TBB_LIB}', libs_path) + package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}', + '${NGRAPH_CPU_LIB_NAME}', + '${NGRAPH_TBB_LIB_NAME}'] # remove unused paddle/libs/__init__.py -os.remove(libs_path+'/__init__.py') +if os.path.isfile(libs_path+'/__init__.py'): + os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path -# change rpath of core.so, add $ORIGIN/../libs/ to it. -# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and -# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. +# change rpath of core.ext, add $ORIGIN/../libs/ to it. 
+# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and +# core.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. - if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) - if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. + if os.name != 'nt': + # only change rpath in Release mode, since in Debug mode, core.xx is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.%s failed, command: %s" % (ext_name, command)) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.xx. 
+ if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + if os.system(command) != 0: + raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command)) + +ext_modules = [Extension('_foo', ['stub.cc'])] +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in package_dir.items(): + fix_package_dir[k] = v.replace('/', '\\') + package_dir = fix_package_dir + ext_modules = [] setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - ext_modules=[Extension('_foo', ['stub.cc'])], + ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, scripts=paddle_bins