Commit 77fc42d2 authored by: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fea/jitkernel

......@@ -52,6 +52,7 @@ ExternalProject_Add(
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_LOGGER=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
......
......@@ -27,7 +27,6 @@ endfunction()
CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# safe_set_flag
#
# Set a compile flag only if the compiler supports it
......@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
endif()
endmacro()
macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
if (BUILD_SHARED_LIBS)
return() # if building shared libs, keep the flags the same as '/MD'
endif(BUILD_SHARED_LIBS)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
......@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Common flags: the compiler flags used for C/C++ sources in both release and debug builds.
# Do not care whether these flags are supported by gcc.
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
set(COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
-Werror
-Wall
-Wextra
-Wnon-virtual-dtor
......@@ -114,11 +131,6 @@ set(COMMON_FLAGS
-Wno-error=terminate # Warning in PADDLE_ENFORCE
)
# https://github.com/PaddlePaddle/Paddle/issues/12773
if (NOT WIN32)
list(APPEND COMMON_FLAGS -Werror)
endif()
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
......@@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS
-Wno-error=array-bounds # Warnings in Eigen::array
)
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32)
if (APPLE)
if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif()
else()
endif(APPLE)
if(LINUX)
set(GPU_COMMON_FLAGS
-Wall
-Wextra
-Werror
${GPU_COMMON_FLAGS})
endif()
endif(LINUX)
if(UNIX AND NOT APPLE)
# exclude Apple from the *nix OS family
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
if(WIN32)
# On Windows, turn off warnings.
safe_set_static_flag()
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
endforeach(flag_var)
endif(WIN32)
add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis)
DEPENDS paddle_v2_apis)
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn
paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
add_subdirectory(mobile)
......@@ -102,7 +102,7 @@ class Float16Transpiler:
continue
for input_arg in current_op.input_arg_names:
if input_arg in self.input_map:
current_op.rename_input(input_arg,
current_op._rename_input(input_arg,
self.input_map[input_arg])
def _remove_unused_var(self):
......@@ -187,7 +187,7 @@ class Float16Transpiler:
shape=var.shape,
persistable=var.persistable)
find_op(var)
var.op.rename_output(var_name, tmp_var_name)
var.op._rename_output(var_name, tmp_var_name)
self.block._insert_op(
i,
type="cast",
......
......@@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
......@@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......@@ -315,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
......
......@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
# NOTE: please add the inference subdirectory last.
add_subdirectory(inference)
endif()
add_subdirectory(train)
......@@ -56,9 +56,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
......@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
graph build_strategy
fast_threaded_ssa_graph_executor)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto)
......
......@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace details {
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
// Convert the graph to run on multiple devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy_.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
}
// Verify that the graph is correct for the multi-device executor.
AppendPass("multi_devices_check_pass");
}
private:
BuildStrategy strategy_;
};
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
const {
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
return pass_builder_;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
#endif
// Create a default one if not initialized by user.
if (!pass_builder_) {
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
}
graph = pass->Apply(std::move(graph));
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
......@@ -15,6 +15,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
......@@ -57,6 +68,30 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
// Users normally don't need to call this API.
// The PassBuilder allows more customized insertion and removal of passes
// from the Python side.
// A new PassBuilder is created based on the configs defined above, and
// the passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(
const ProgramDesc &main_program,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
#endif
private:
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
};
} // namespace details
......
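The comments above describe the intended flow: `CreatePassesFromStrategy` builds a `PassBuilder` from the strategy's config, the builder can then be customized, and `Apply` runs the resulting passes over a `ProgramDesc` to produce an `ir::Graph`. A minimal caller-side sketch (assumed usage, not part of this diff; `main_program`, `places`, `loss_var_name`, `param_names`, and `local_scopes` are hypothetical locals; CPU-only build):

    BuildStrategy strategy;
    strategy.fuse_elewise_add_act_ops_ = true;           // enable op fusion
    auto builder = strategy.CreatePassesFromStrategy();  // builder owns the passes
    // optionally customize, e.g. drop the final check pass:
    builder->RemovePass(builder->AllPasses().size() - 1);
    std::unique_ptr<ir::Graph> graph = strategy.Apply(
        main_program, places, loss_var_name, param_names, local_scopes,
        /*use_cuda=*/false);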
......@@ -20,79 +20,37 @@ namespace paddle {
namespace framework {
namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
template <class T>
class COWPtr {
public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
typedef std::shared_ptr<T> RefPtr;
private:
bool flag_;
};
RefPtr m_sp;
// Copy-On-Write pointer.
// It holds a T* pointer and copies the payload only when `MutableData` is
// invoked on a shared instance.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
const T& Data() const { return *m_sp; }
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is shared, it is copied first.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
DetachIfNotUnique();
return m_sp.get();
}
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
void DetachIfNotUnique() {
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag.
OwnershipFlags ownership_;
void Detach() {
T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
};
} // namespace details
} // namespace framework
} // namespace paddle
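The rewritten COWPtr shares its payload through a std::shared_ptr and deep-copies it only when MutableData() is called while the payload is still shared. A minimal sketch of the semantics (assumed usage, mirroring the test added below):

    details::COWPtr<int> a(new int{1});
    details::COWPtr<int> b = a;   // shallow copy: both share the same int
    int x = b.Data();             // read access never copies (x == 1)
    *a.MutableData() = 2;         // `a` detaches: clones the int, then writes
    // afterwards a.Data() == 2 while b.Data() is still 1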
......@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10);
}
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN)
if (WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference)
endif()
endif ()
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference)
......@@ -41,12 +41,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN)
if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif()
endif ()
......@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
// This fusion only works on the RNN1 model.
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
}
}
if (count < specified_vars.size()) {
return graph;
}
// Continue to fuse.
FindWhileOp(graph.get());
return graph;
}
......
......@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
......@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetOutput("CheckedCell", {CheckedCell});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
......
......@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm>
#include <deque>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace ir {
......@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return adj_list;
}
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes = graph.Nodes();
std::unordered_set<ir::Node *> visited_nodes;
visited_nodes.reserve(nodes.size());
std::deque<ir::Node *> q_nodes;
std::vector<std::unordered_set<ir::Node *>> graph_nodes;
std::unordered_set<ir::Node *> g_nodes;
size_t graph_count = 0;
auto traverse_nodes = [&visited_nodes,
&q_nodes](const std::vector<ir::Node *> &nodes) {
std::copy_if(
nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
[&visited_nodes](Node *node) { return !visited_nodes.count(node); });
};
while (visited_nodes.size() != nodes.size()) {
if (!q_nodes.empty()) {
auto cur_node = q_nodes.front();
q_nodes.pop_front();
visited_nodes.insert(cur_node);
g_nodes.insert(cur_node);
traverse_nodes(cur_node->inputs);
traverse_nodes(cur_node->outputs);
} else {
++graph_count;
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
g_nodes.clear();
for (auto &n : nodes) {
if (visited_nodes.count(n) == 0) {
q_nodes.push_back(n);
break;
}
}
}
}
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
if (VLOG_IS_ON(10)) {
VLOG(10) << "graph_num: " << graph_nodes.size();
for (auto &g_n : graph_nodes) {
VLOG(10) << "graph_nodes: " << g_n.size();
if (g_n.size() < 10) {
std::stringstream out;
for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) {
out << n->Name() << ", ";
}
out << "], out[";
for (auto &n : node->outputs) {
out << n->Name() << ", ";
}
out << "]";
}
VLOG(10) << out.str();
}
}
}
return graph_count;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -27,6 +27,8 @@ namespace ir {
// Test whether the graph contains a cycle.
bool HasCircle(const Graph &graph);
size_t GraphNum(const Graph &graph);
// Topologically sort the operations in the graph from inputs to outputs.
// `graph` cannot contain a cycle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
......
......@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
ASSERT_EQ(node_map.at("op2"), 1UL);
ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
}
void BuildZeroGraph(Graph* g) {}
void BuildOneGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3->v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
void BuildTwoGraphs(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
// o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
// v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3->v4->o5
o3->outputs.push_back(v4);
// o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
// v4->outputs.push_back(o5);
}
TEST(GraphHelperTest, GraphNum) {
ProgramDesc prog;
Graph g(prog);
BuildZeroGraph(&g);
ASSERT_EQ(GraphNum(g), 0);
Graph g2(prog);
BuildOneGraph(&g2);
ASSERT_EQ(GraphNum(g2), 1);
Graph g3(prog);
BuildTwoGraphs(&g3);
ASSERT_EQ(GraphNum(g3), 2);
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
#include <vector>
namespace paddle {
namespace framework {
namespace ir {
......
......@@ -19,7 +19,6 @@ namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
for (const std::string& attr : required_pass_attrs_) {
PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
......
......@@ -42,6 +42,8 @@ class Pass {
attr_dels_.clear();
}
std::string Type() const { return type_; }
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
// Get a reference to the attribute previously set.
......@@ -52,6 +54,21 @@ class Pass {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
}
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
}
void Erase(const std::string &attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}
// Set a pointer to the attribute. Pass takes ownership of the attribute.
template <typename AttrType>
void Set(const std::string &attr_name, AttrType *attr) {
......@@ -68,13 +85,15 @@ class Pass {
// should delete the attribute.
template <typename AttrType>
void SetNotOwned(const std::string &attr_name, AttrType *attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
attr_name);
attrs_[attr_name] = attr;
}
protected:
virtual std::unique_ptr<Graph> ApplyImpl(
std::unique_ptr<Graph> graph) const = 0;
virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
LOG(FATAL) << "Calling virtual Pass not implemented.";
}
private:
template <typename PassType>
......@@ -89,7 +108,10 @@ class Pass {
required_graph_attrs_.insert(attrs.begin(), attrs.end());
}
void RegisterType(const std::string &type) { type_ = type; }
mutable bool applied_{false};
std::string type_;
std::unordered_set<std::string> required_pass_attrs_;
std::unordered_set<std::string> required_graph_attrs_;
std::map<std::string, boost::any> attrs_;
......@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
"'%s' is registered more than once.", pass_type);
PassRegistry::Instance().Insert(
pass_type, [this]() -> std::unique_ptr<Pass> {
pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
std::unique_ptr<Pass> pass(new PassType());
pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
pass->RegisterType(pass_type);
return pass;
});
}
......
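The newly added Has() and Erase() round out the pass attribute API. A short sketch of how the calls compose (assumed usage; the attribute value is illustrative):

    auto pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
    pass->Set<std::string>("graph_viz_path", new std::string("viz.dot"));
    if (pass->Has("graph_viz_path")) {
      const std::string &path = pass->Get<std::string>("graph_viz_path");  // "viz.dot"
      pass->Erase("graph_viz_path");  // runs the registered deleter, then erases
    }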
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h"
namespace paddle {
namespace framework {
namespace ir {
std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
auto pass = ir::PassRegistry::Instance().Get(pass_type);
passes_.emplace_back(pass.release());
return passes_.back();
}
void PassBuilder::RemovePass(size_t idx) {
PADDLE_ENFORCE(passes_.size() > idx);
passes_.erase(passes_.begin() + idx);
}
std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
const std::string& pass_type) {
PADDLE_ENFORCE(passes_.size() >= idx);
std::shared_ptr<Pass> pass(
ir::PassRegistry::Instance().Get(pass_type).release());
passes_.insert(passes_.begin() + idx, std::move(pass));
return passes_[idx];
}
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class PassBuilder {
public:
PassBuilder() {}
virtual ~PassBuilder() {}
// Append a new pass to the end.
std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
// Insert a new pass at position `idx`.
std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
// Remove the pass at `idx`.
void RemovePass(size_t idx);
// Returns a list of all passes.
std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
protected:
std::vector<std::shared_ptr<Pass>> passes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
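A short sketch of the index semantics (assumed usage; the pass names are existing registered passes): AppendPass adds at the end, InsertPass places the new pass at the given index, and RemovePass deletes by index.

    PassBuilder builder;
    builder.AppendPass("graph_viz_pass");              // index 0
    builder.AppendPass("multi_devices_check_pass");    // index 1
    builder.InsertPass(1, "graph_to_program_pass");    // becomes index 1
    builder.RemovePass(0);                             // drops graph_viz_pass
    auto passes = builder.AllPasses();                 // 2 passes remain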
......@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
try {
// Applying a pass more than once is now allowed.
graph.reset(new Graph(prog));
graph->Set<int>("test_graph_attr", new int);
graph = pass->Apply(std::move(graph));
} catch (paddle::platform::EnforceNotMet e) {
exception = std::string(e.what());
}
ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
pass = PassRegistry::Instance().Get("test_pass");
pass->SetNotOwned<int>("test_pass_attr", &val);
......
......@@ -17,10 +17,13 @@
#include <algorithm>
#include <initializer_list>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h"
......@@ -28,173 +31,167 @@ namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
// Default ctor. Create empty Vector
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
// Ctor with init_list
Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U> &dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Copy ctor
Vector(const Vector<T> &other) { this->operator=(other); }
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
// Copy operator
Vector<T> &operator=(const Vector<T> &other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this;
}
private:
// The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
// Move ctor
Vector(Vector<T> &&other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
// CPU data access method. Mutable.
T &operator[](size_t i) {
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
return cpu_[i];
}
// CPU data access method. Immutable.
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
return cpu_[i];
}
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
size_t size() const { return cpu_.size(); }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
T *end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() { return *begin(); }
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
auto it = end();
--it;
return *it;
MutableCPU();
return cpu_.back();
}
const T *begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const T *end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T *cbegin() const { return begin(); }
const T *cend() const { return end(); }
const T &back() const {
auto it = end();
--it;
return *it;
ImmutableCPU();
return cpu_.back();
}
T *data() { return begin(); }
T *data() { return &(*this)[0]; }
const T *data() const { return begin(); }
const T *data() const { return &(*this)[0]; }
const T &front() const { return *begin(); }
// end of std::vector iterator methods
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end);
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
cpu_.resize(size);
}
// get cuda ptr. immutable
......@@ -202,7 +199,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return cuda_vec_.data<T>();
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
......@@ -214,77 +211,39 @@ class Vector {
// clear
void clear() {
size_ = 0;
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unified method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data();
}
}
// the unified method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
return CUDAMutableData(place);
} else {
return data();
}
}
void reserve(size_t size) const { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
ImmutableCPU();
return cpu_;
}
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
private:
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
std::mutex &Mutex() const { return mtx_; }
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
}
}
private:
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
......@@ -294,8 +253,15 @@ class Vector {
void CopyToCPU() const {
// Copy GPU data to CPU.
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
dev_ctx->Wait();
}
void MutableCPU() {
......@@ -308,16 +274,12 @@ class Vector {
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
......@@ -326,17 +288,10 @@ class Vector {
} else {
if (!IsInCUDA()) {
// Even though the data is not dirty, it is not in CUDA. Copy the data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
......@@ -344,9 +299,20 @@ class Vector {
}
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
......@@ -362,23 +328,178 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
// Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {}
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
}
// Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; }
// Copy operator
Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_;
return *this;
}
// Move ctor
Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
// CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_.MutableData())[i]; }
// CPU data access method. Immutable.
const T &operator[](size_t i) const { return m_.Data()[i]; }
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_.Data().size(); }
iterator begin() { return m_.MutableData()->begin(); }
iterator end() { return m_.MutableData()->end(); }
T &front() { return m_.MutableData()->front(); }
T &back() { return m_.MutableData()->back(); }
const_iterator begin() const { return m_.Data().begin(); }
const_iterator end() const { return m_.Data().end(); }
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const T &back() const { return m_.Data().back(); }
T *data() { return m_.MutableData()->data(); }
const T *data() const { return m_.Data().data(); }
const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
m_.MutableData()->assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) { m_.MutableData()->push_back(elem); }
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
m_.MutableData()->Extend(begin, end);
}
// resize the vector
void resize(size_t size) {
if (m_.Data().size() != size) {
m_.MutableData()->resize(size);
}
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDA data in a different place, detach manually.
m_.Detach();
return CUDAData(place);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDA data in a different place, detach manually.
m_.Detach();
return CUDAMutableData(place);
}
// clear
void clear() { m_.MutableData()->clear(); }
size_t capacity() const { return m_.Data().capacity(); }
// reserve data
void reserve(size_t size) { m_.Data().reserve(size); }
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data();
}
}
// the unify method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
return CUDAMutableData(place);
} else {
return data();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return m_.Data(); }
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
const void *Handle() const { return &m_.Data(); }
private:
// Vector is a COW (copy-on-write) object.
mutable details::COWPtr<VectorData> m_;
};
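// A minimal sketch of the copy-on-write behavior the COWPtr member provides
// (illustrative only, not part of this change; variable names are made up):
//
//   framework::Vector<int> a(4, 1);  // a == [1, 1, 1, 1]
//   framework::Vector<int> b = a;    // b shares a's VectorData; no copy yet
//   b.push_back(2);                  // MutableData() detaches b before writing
//   // a.size() == 4 and b.size() == 5; a is unaffected by the write to b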
#else // PADDLE_WITH_CUDA
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// This code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::CHANNEL) {
var->GetMutable<ChannelHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, STEP_SCOPES, "
"LOD_RANK_TABLE, LOD_TENSOR_ARRAY, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
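// Usage sketch (illustrative only; "x" is a made-up variable name). This is
// the pattern CreateVariables below applies to every variable in the block:
//
//   framework::Scope scope;
//   auto *var = scope.Var("x");
//   InitializeVariable(var, proto::VarType::LOD_TENSOR);
//   auto *tensor = var->GetMutable<LoDTensor>();  // ready to resize and fill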
void NaiveExecutor::Prepare(Scope *parent_scope,
const ProgramDesc &program_desc, int block_id,
bool with_feed_fetch_ops) {
if (!parent_scope) {
scope_ = new framework::Scope;
} else {
scope_ = &parent_scope->NewScope();
}
CreateVariables(program_desc, scope_, block_id);
CreateOps(program_desc, block_id, with_feed_fetch_ops);
}
void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(4) << "run " << op->Type();
op->Run(*scope_, place_);
}
}
void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
int block_id) {
PADDLE_ENFORCE(scope);
auto &global_block = desc.Block(block_id);
const Scope *ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto &var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
// Create persistable vars in ancestor scope.
if (var->Persistable()) {
auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else { // Create temporary variables in local scope.
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto &var : global_block.AllVars()) {
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
bool with_feed_fetch_ops) {
for (const auto &op_desc : desc.Block(block_id).AllOps()) {
if (!with_feed_fetch_ops &&
(op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
op_desc->Input("X")[0], op_desc->Type(),
op_desc->Output("Out")[0]);
continue;
}
ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
}
}
LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
PADDLE_ENFORCE(scope_, "Need to init scope first");
auto *var = scope_->FindVar(name);
PADDLE_ENFORCE(var, "No variable [%s] in the scope");
auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
return tensor;
}
void NaiveExecutor::CleanFeedFetchOps() {
std::vector<std::unique_ptr<OperatorBase>> ops;
for (auto &op : ops_) {
if (op->Type() != "feed" && op->Type() != "fetch") {
ops.emplace_back(std::move(op));
}
}
ops_.swap(ops);
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
/*
* Simple, intuitive and effective. Only a single thread is supported, and it
* is currently designed for inference.
*/
class NaiveExecutor {
public:
explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
// Create child scope.
// Create variables.
// @with_feed_fetch_ops: whether to work with the feed and fetch operators.
void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
int block_id, bool with_feed_fetch_ops);
// Run all the operators.
void Run();
// Get a tensor to operate on directly, without the need for feed ops.
LoDTensor* FindTensor(const std::string& name);
Scope* scope() { return scope_; }
void CleanFeedFetchOps();
protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
void CreateOps(const ProgramDesc& desc, int block_id,
bool with_feed_fetch_ops);
private:
const platform::Place place_;
// Cache the required resources to avoid repeated creation.
std::vector<std::unique_ptr<OperatorBase>> ops_;
Scope* scope_;
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
TEST(NaiveExecutor, Basic) {
ProgramDesc program;
auto* main_block = program.MutableBlock(0);
auto* a = main_block->Var("a"); // input
auto* b = main_block->Var("b"); // input
auto* c = main_block->Var("c"); // input
a->SetType(proto::VarType::LOD_TENSOR);
b->SetType(proto::VarType::LOD_TENSOR);
c->SetType(proto::VarType::LOD_TENSOR);
auto* add = main_block->AppendOp();
add->SetType("elementwise_add");
add->SetInput("X", {"a"});
add->SetInput("Y", {"b"});
add->SetOutput("Out", {"c"});
auto place = platform::CPUPlace();
NaiveExecutor exe(place);
exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
auto* a_tensor = exe.FindTensor("a");
auto* b_tensor = exe.FindTensor("b");
auto* c_tensor = exe.FindTensor("c");
a_tensor->Resize({1, 4});
b_tensor->Resize({1, 4});
c_tensor->Resize({1, 4});
b_tensor->mutable_data<float>(place);
a_tensor->mutable_data<float>(place);
float a_arr[] = {0, 1, 2, 3};
float b_arr[] = {0.0, .1, .2, .3};
std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
exe.Run();
auto* c_data = c_tensor->mutable_data<float>(place);
for (int i = 0; i < 4; i++) {
EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
}
}
} // namespace framework
} // namespace paddle
USE_OP(elementwise_add);
......@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
}
RunImpl(scope, place);
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
......
......@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -35,80 +33,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
const BuildStrategy &strategy) {
#endif
// Convert the program to graph.
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
// Apply op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass =
ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
class ParallelExecutorPrivate {
public:
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
......@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy,
member_->nccl_ctxs_.get());
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) {
......@@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor(
}
}
#else
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy);
std::unique_ptr<ir::Graph> graph =
build_strategy.Apply(main_program, member_->places_, loss_var_name,
params, member_->local_scopes_, member_->use_cuda_);
#endif
// If loss_var_name is given, the number of graphs should be exactly one.
if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
"The number of graph should be only one");
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph)));
......@@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass);
#endif
......@@ -14,14 +14,14 @@ limitations under the License. */
#pragma once
#include <paddle/fluid/framework/details/build_strategy.h>
#include <atomic>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......
......@@ -20,6 +20,13 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
// The mutex is not needed by training and inference, only for distribution.
#if PADDLE_WITH_DISTRIBUTE
#define WITH_LOCK 1
#else
#define WITH_LOCK 0
#endif
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
......@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
kids_.push_back(new Scope(this));
return *kids_.back();
}
Variable* Scope::Var(const std::string& name) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
return VarInternal(name);
}
Variable* Scope::Var(std::string* name) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) {
*name = new_name;
......@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
}
Variable* Scope::FindVar(const std::string& name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
return FindVarInternal(name);
}
const Scope* Scope::FindScope(const Variable* var) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
return FindScopeInternal(var);
}
void Scope::DropKids() {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
for (Scope* s : kids_) delete s;
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
......@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
}
void Scope::DeleteScope(Scope* scope) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
......@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
}
void Scope::EraseVars(const std::vector<std::string>& var_names) {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
......@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
RenameInternal(origin_name, new_name);
}
std::string Scope::Rename(const std::string& origin_name) const {
#if WITH_LOCK
std::unique_lock<std::mutex> lock(mutex_);
#endif
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name);
return new_name;
......
......@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>(
auto* data = value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
for (int64_t i = 0; i < value->numel(); ++i) {
data[i] = static_cast<float>(i);
}
}
protected:
......@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
auto* dst_data = dst_tensor.value().data<float>();
for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
ASSERT_EQ(dst_data[i], static_cast<float>(i));
}
}
TEST(SelectedRows, SparseTable) {
......
......@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser;
analyser.Run(&argument);
}
void TestWord2vecPrediction(const std::string &model_path) {
void TestWord2vecPrediction(const std::string& model_path) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
......@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << "data: "
<< static_cast<float *>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
<< static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
......
......@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
}
}
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
void CreateTrtEngineOp(Node *node, Argument *argument,
framework::proto::BlockDesc *block) {
PADDLE_ENFORCE(argument->main_dfg.get());
const DataFlowGraph &graph = *(argument->main_dfg);
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
......@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
......@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
CreateTrtEngineOp(node, argument_, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
......@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
continue;
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/node.h"
......@@ -63,8 +64,11 @@ class SubGraphFuse {
public:
using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
: graph_(graph), node_inside_subgraph_teller_(teller) {}
SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
Argument *argument)
: graph_(graph),
node_inside_subgraph_teller_(teller),
argument_(argument) {}
// The main method which run all the logic.
void operator()();
......@@ -76,6 +80,7 @@ class SubGraphFuse {
private:
DataFlowGraph *graph_;
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument *argument_;
};
} // namespace analysis
......
......@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
TEST(SubGraphSplitter, Fuse) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc);
Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(3));
size_t count0 = dfg.nodes.size();
SubGraphFuse fuse(&dfg, teller);
SubGraphFuse fuse(&dfg, teller, &argument);
fuse();
int count1 = 0;
......
......@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
......
......@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override {
argument_ = argument;
return true;
}
// This class takes a sub-graph as input and determines whether to transform
// it into TensorRT.
......@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
Argument* argument_;
};
} // namespace analysis
......
......@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
};
Argument argument(FLAGS_inference_model_dir);
argument.Set<int>("minimum_subgraph_size", new int(0));
argument.Set<int>("max_batch_size", new int(3));
argument.Set<int>("workspace_size", new int(1 << 20));
argument.Set<std::string>("precision_mode", new std::string("FP32"));
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
......
......@@ -18,10 +18,10 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif()
function(inference_api_test TARGET_NAME)
......@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
......@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book)
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
......
......@@ -16,11 +16,15 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -28,8 +32,11 @@ DECLARE_bool(profile);
namespace paddle {
using contrib::AnalysisConfig;
bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope>& parent_scope) {
const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program) {
VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) {
......@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently";
LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
"is turned false.";
config_.enable_ir_optim = false;
} else {
place_ = paddle::platform::CPUPlace();
......@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
scope_.reset(new paddle::framework::Scope());
}
executor_.reset(new paddle::framework::NaiveExecutor(place_));
// Initialize the inference program
if (!program) {
if (!LoadProgramDesc()) return false;
OptimizeInferenceProgram();
} else {
LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
inference_program_ = program;
}
executor_->Prepare(scope_.get(), *inference_program_, 0,
config_.use_feed_fetch_ops);
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch();
return true;
}
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
return false;
}
// Run the inference program
// If variables are shared, we need not create them again.
executor_->Run();
// get fetch variable
if (!GetFetch(output_data, scope)) {
LOG(ERROR) << "fail to get fetches";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get());
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch();
bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
return false;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
lod.emplace_back(level);
}
input.set_lod(lod);
int idx = -1;
if (config_.specify_input_name) {
idx = feed_names_[inputs[i].name];
} else {
idx = boost::get<int>(feeds_[i]->GetAttr("col"));
}
framework::SetFeedVariable(scope, input, "feed", idx);
}
return true;
}
template <typename T>
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) {
// set shape.
auto shape = framework::vectorize(fetch.dims());
output->shape.assign(shape.begin(), shape.end());
// set data.
const T *data = fetch.data<T>();
int num_elems = inference::VecReduceToInt(shape);
output->data.Resize(num_elems * sizeof(T));
// The fetched tensor output by the fetch op should always be in CPU memory,
// so just copy.
memcpy(output->data.data(), data, num_elems * sizeof(T));
// set lod
output->lod.clear();
for (auto &level : fetch.lod()) {
output->lod.emplace_back(level.begin(), level.end());
}
}
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;
} else if (type == typeid(int64_t)) {
GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
}
return true;
}
......@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
new std::string(config_.prog_file));
argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
}
argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE(
......@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
const contrib::AnalysisConfig& config) {
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) {
// 1. GPU memory
......@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
}
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
if (!dynamic_cast<AnalysisPredictor*>(predictor.get())->Init(nullptr)) {
if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
return nullptr;
}
return predictor;
}
void AnalysisPredictor::PrepareFeedFetch() {
for (auto *op : inference_program_->Block(0).AllOps()) {
if (op->Type() == "feed") {
int idx = boost::get<int>(op->GetAttr("col"));
if (feeds_.size() <= static_cast<size_t>(idx)) {
feeds_.resize(idx + 1);
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
}
fetchs_[idx] = op;
}
}
}
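// Sketch of what PrepareFeedFetch expects from block 0 (illustrative only;
// the op and variable names are hypothetical). Every feed/fetch op must carry
// an integer "col" attribute giving its slot index:
//
//   framework::ProgramDesc program;
//   auto *block = program.MutableBlock(0);
//   auto *feed_op = block->AppendOp();
//   feed_op->SetType("feed");
//   feed_op->SetInput("X", {"feed"});  // the global feed list variable
//   feed_op->SetOutput("Out", {"x"});  // feed_names_["x"] becomes 0
//   feed_op->SetAttr("col", 0);        // slot index read above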
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
return res;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
return res;
}
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
return true;
}
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::unique_ptr<framework::Executor> tmp_exe(
new framework::Executor(platform::CPUPlace()));
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir,
config_.param_file);
return false;
}
return true;
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_);
return std::unique_ptr<PaddlePredictor>(x);
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig& config) {
const contrib::AnalysisConfig &config) {
return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config);
}
......
......@@ -12,42 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
using inference::analysis::Argument;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
using contrib::AnalysisConfig;
/* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize the IR and parameters at runtime.
* TODO(Superjomn) Replace the Native predictor?
*/
class AnalysisPredictor : public PaddlePredictor {
public:
explicit AnalysisPredictor(const contrib::AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {}
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override {
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string &name) override;
bool ZeroCopyRun() override;
void PrepareFeedFetch();
void OptimizeInferenceProgram();
Argument& analysis_argument() { return argument_; }
Argument &analysis_argument() { return argument_; }
std::unique_ptr<PaddlePredictor> Clone() override;
framework::Scope *scope() { return executor_->scope(); }
framework::ProgramDesc &program() { return *inference_program_; }
protected:
bool LoadProgramDesc();
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
private:
contrib::AnalysisConfig config_;
Argument argument_;
std::unique_ptr<NaiveExecutor> executor_;
platform::Place place_;
std::shared_ptr<framework::Scope> scope_;
framework::Scope *sub_scope_{nullptr};
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
};
} // namespace paddle
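// A minimal sketch of the conventional Run() path through AnalysisPredictor
// (illustrative only; the model path and tensor name are placeholders, not
// values from this change):
//
//   contrib::AnalysisConfig config;
//   config.model_dir = "/path/to/model";
//   config.use_gpu = false;
//   auto predictor = CreatePaddlePredictor<contrib::AnalysisConfig,
//                                          PaddleEngineKind::kAnalysis>(config);
//   PaddleTensor input;
//   input.name = "firstw";
//   input.shape = {4, 1};
//   input.dtype = PaddleDType::INT64;
//   input.data.Resize(4 * sizeof(int64_t));
//   std::vector<PaddleTensor> outputs;
//   predictor->Run({input}, &outputs);  // SetFeed -> NaiveExecutor -> GetFetch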
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(dirname, "", "dirname to tests.");
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.use_feed_fetch_ops = false;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw");
auto w2 = predictor->GetInputTensor("thirdw");
auto w3 = predictor->GetInputTensor("forthw");
w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});
auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}
predictor->ZeroCopyRun();
auto out = predictor->GetOutputTensor("fc_1.tmp_2");
PaddlePlace place;
int size = 0;
auto* out_data = out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
LOG(INFO) << "output_data: " << out_data;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace paddle {
......@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
}
}
PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(PaddleBuf &&other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
......@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
......@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
return *this;
}
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
// only the buffer with external memory can be copied
data_ = other.data_;
length_ = other.length_;
......@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
}
}
void PaddleBuf::Reset(void *data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
......@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0);
free(static_cast<char*>(data_));
free(static_cast<char *>(data_));
data_ = nullptr;
length_ = 0;
}
......
......@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(ctx_.get(), scope,
false, /* don't create local scope each time*/
false /* don't create variable each time */);
VLOG(4) << "Finish prepared context";
// get fetch variable
if (!GetFetch(output_data, scope)) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......@@ -30,6 +30,8 @@
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override;
framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
......
......@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() {
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
......@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
FLAGS_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes =
......@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
FLAGS_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs);
......
......@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
......@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void OptimizeInferenceProgram() {
// Analyze inference_program
Argument argument;
argument.Set<int>("minimum_subgraph_size",
new int(config_.minimum_subgraph_size));
argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
argument.Set<int>("workspace_size", new int(config_.workspace_size));
argument.Set<std::string>("precision_mode",
new std::string(config_.precision_mode));
if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(input_or_output_,
"Can't reshape the output tensor, it is readonly");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
}
case static_cast<int>(PaddlePlace::kGPU): {
return tensor->mutable_data<T>(platform::CUDAPlace());
}
default:
PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
break;
}
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
auto *res = tensor->data<T>();
if (platform::is_cpu_place(tensor->place())) {
*place = PaddlePlace::kCPU;
} else if (platform::is_gpu_place(tensor->place())) {
*place = PaddlePlace::kGPU;
} else {
*place = PaddlePlace::kUNK;
}
*size = tensor->numel();
return res;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
}
tensor->set_lod(lod);
}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
std::vector<std::vector<size_t>> res;
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
for (auto &level : tensor->lod()) {
res.emplace_back(level);
}
return res;
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
return nullptr;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
return std::vector<std::vector<size_t>>();
}
} // namespace paddle
......@@ -21,8 +21,10 @@
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace inference {
......@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
int c = 0;
for (const auto &f : data) {
for (T v : f) {
ptr[c++] = v;
}
}
  return c;  // number of elements actually copied
}
static std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
......@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss;
ss << "\n---- tensor ---" << '\n';
ss << "lod: [";
for (const auto &level : tensor.lod()) {
ss << "[ ";
for (auto i : level) {
ss << i << ", ";
}
ss << "]";
}
ss << "]\n";
ss << "shape: [";
int size = 1;
for (int i = 0; i < tensor.dims().size(); i++) {
int dim = tensor.dims()[i];
ss << dim << ", ";
size *= dim;
}
ss << "]\n";
ss << "data: ";
for (int i = 0; i < std::min(20, size); i++) {
ss << tensor.data<T>()[i] << " ";
}
ss << "\n";
return ss.str();
}
static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
auto &al = a[i];
auto &bl = b[i];
if (al.size() != bl.size()) {
LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
bl.size());
return false;
}
}
return true;
}
static bool CompareShape(const std::vector<int64_t> &a,
const std::vector<int64_t> &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) {
LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
a[i], b[i]);
return false;
}
}
return true;
}
static bool CompareTensorData(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
auto a_shape = framework::vectorize(a.dims());
auto b_shape = framework::vectorize(b.dims());
size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
[](int a, int b) { return a * b; });
size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
[](int a, int b) { return a * b; });
  if (a_size != b_size) {
    LOG(ERROR) << string::Sprintf("tensor data size mismatch: %d != %d",
                                  a_size, b_size);
    return false;
  }
for (size_t i = 0; i < a_size; i++) {
if (a.type() == typeid(float)) {
const auto *a_data = a.data<float>();
const auto *b_data = b.data<float>();
if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element mismatch: %f != %f", i, a_data[i],
            b_data[i]);
return false;
}
    } else if (a.type() == typeid(int64_t)) {
      const auto *a_data = a.data<int64_t>();
      const auto *b_data = b.data<int64_t>();
      if (a_data[i] != b_data[i]) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element mismatch: %d != %d", i, a_data[i],
            b_data[i]);
        return false;
      }
    }
}
return true;
}
static bool CompareTensor(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
if (!CompareLoD(a.lod(), b.lod())) {
return false;
}
if (!CompareShape(framework::vectorize(a.dims()),
framework::vectorize(b.dims()))) {
return false;
}
if (!CompareTensorData(a, b)) {
return false;
}
return true;
}
} // namespace inference
} // namespace paddle
......@@ -101,6 +101,40 @@ struct PaddleTensor {
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor.
class ZeroCopyTensor {
public:
void Reshape(const std::vector<int>& shape);
  // Get the memory in CPU or GPU with the specific data type; call Reshape
  // first to tell the data size.
  // One can then write the input directly through the returned pointer.
  // This is for writing the input tensor.
template <typename T>
T* mutable_data(PaddlePlace place);
  // Get the memory directly; the place and element count are returned through
  // the pointer arguments. This is for reading the output tensor.
template <typename T>
T* data(PaddlePlace* place, int* size);
std::vector<int64_t> shape();
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
protected:
ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; }
void* FindTensor() const;
private:
std::string name_;
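  // True for input tensors (writable), false for output tensors (read-only).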
bool input_or_output_;
friend class AnalysisPredictor;
void* scope_{nullptr};
};
/*
* A simple Inference API for Paddle.
*/
......@@ -120,6 +154,19 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
// Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly,
// without copy.
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) {
return nullptr;
}
virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string& name) {
return nullptr;
}
virtual bool ZeroCopyRun() { return false; }
  // Clone a predictor that shares the model weights; the cloned predictor
  // should be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
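As an illustration, a minimal zero-copy call sequence based only on the declarations above would look like the sketch below; the model path and the tensor names "x" and "out" are placeholders, not part of this patch — the real names come from the model. Setting use_feed_fetch_ops to false bypasses the feed/fetch operators so data is written and read in place.

  AnalysisConfig config;
  config.model_dir = "/path/to/model";  // placeholder
  config.use_feed_fetch_ops = false;    // required for the zero-copy path
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  auto input = predictor->GetInputTensor("x");  // "x" is a placeholder name
  input->Reshape({1, 4});  // declare the size before asking for memory
  auto *in_data = input->mutable_data<float>(PaddlePlace::kCPU);
  for (int i = 0; i < 4; i++) in_data[i] = 1.0f;  // write the input in place

  predictor->ZeroCopyRun();

  auto output = predictor->GetOutputTensor("out");  // "out" is a placeholder
  PaddlePlace place;
  int size = 0;
  auto *out_data = output->data<float>(&place, &size);  // read in place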
......@@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig {
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int workspace_size{1 << 30};
  // We transform the Ops that can be converted into TRT layers in the model,
  // and aggregate these Ops into subgraphs for TRT execution.
  // This variable controls the minimum number of nodes such a subgraph must
  // contain; the default value is 3.
int minimum_subgraph_size = 3;
  // Reserved configuration.
  // Only "FP32" is supported now; "FP16" and "INT8" will be supported later.
std::string precision_mode = "FP32";
};
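As an illustration, the TensorRT fields above combine like this (the model path and sizes are placeholder values, not from this patch):

  MixedRTConfig trt_config;
  trt_config.model_dir = "/path/to/model";  // placeholder
  trt_config.use_gpu = true;
  trt_config.max_batch_size = 4;            // upper bound for runtime batches
  trt_config.workspace_size = 1 << 30;      // TRT scratch space, in bytes
  trt_config.minimum_subgraph_size = 3;     // skip TRT for smaller subgraphs
  trt_config.precision_mode = "FP32";       // the only supported mode so far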
// NOTE WIP, not stable yet.
......@@ -204,12 +259,18 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`.
};
// Determine whether to perform graph optimization.
bool enable_ir_optim = true;
// Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude};
  // attention lstm fuse works only on some specific models; disabled by default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
std::vector<std::string> ir_passes;
// NOT stable yet.
bool use_feed_fetch_ops{true};
  // NOTE this is just for internal development; please do not use it.
  // NOTE this is just for internal development; please do not use it. NOT
  // stable yet.
bool _use_mkldnn{false};
};
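As an illustration, under the kExclude mode above, disabling one extra pass amounts to the following sketch (the pass name is taken from the default list this patch removes):

  AnalysisConfig cfg;
  cfg.enable_ir_optim = true;
  cfg.ir_mode = AnalysisConfig::IrPassMode::kExclude;
  // Under kExclude, every registered pass runs except those listed here.
  cfg.ir_passes.push_back("attention_lstm_fuse_pass");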
......
......@@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
cc_test(test_trt_models SRCS trt_models_tester.cc
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
DEPS paddle_inference_tensorrt_subgraph_engine)
endif()
......@@ -18,6 +18,8 @@ namespace paddle {
namespace inference {
namespace analysis {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<int64_t> data;
std::vector<size_t> lod;
......@@ -78,6 +80,7 @@ struct DataRecord {
}
}
}
DataRecord NextBatch() {
DataRecord data;
data.data = batched_datas[batch_iter];
......@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
......@@ -16,6 +16,7 @@
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
......@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
......@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
using namespace contrib; // NOLINT
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
......@@ -29,10 +33,12 @@ struct DataRecord {
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
......@@ -101,6 +107,7 @@ struct DataRecord {
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
......@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
void SetConfig(contrib::AnalysisConfig *cfg) {
void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
ZeroCopyTensor *cell_init_tensor,
ZeroCopyTensor *data_tensor,
ZeroCopyTensor *hidden_init_tensor,
ZeroCopyTensor *week_tensor,
ZeroCopyTensor *minute_tensor,
DataRecord *data_record, int batch_size) {
auto one_batch = data_record->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor->Reshape({1, 2});
lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});
cell_init_tensor->Reshape({batch_size, 15});
cell_init_tensor->SetLoD({one_batch.lod3});
hidden_init_tensor->Reshape({batch_size, 15});
hidden_init_tensor->SetLoD({one_batch.lod3});
data_tensor->Reshape(rnn_link_data_shape);
data_tensor->SetLoD({one_batch.lod1});
week_tensor->Reshape(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor->SetLoD({one_batch.lod3});
minute_tensor->Reshape(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor->SetLoD({one_batch.lod3});
// assign data
float arr0[] = {0, 0};
std::vector<float> zeros(batch_size * 15, 0);
std::copy_n(arr0, 2,
lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}
void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
......@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
......@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}
bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
const std::vector<std::string> &tensors) {
for (auto &x : tensors) {
auto *a_var = a_scope.FindVar(x);
auto *b_var = b_scope.FindVar(x);
if (a_var && b_var) {
if (a_var->Type() == typeid(framework::LoDTensor) ||
a_var->Type() == typeid(framework::Tensor)) {
LOG(INFO) << "comparing tensor " << x;
auto &a_t = a_var->Get<framework::LoDTensor>();
auto &b_t = b_var->Get<framework::LoDTensor>();
if (!inference::CompareTensor(a_t, b_t)) {
LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
}
} else {
LOG(INFO) << "skip no tensor " << x;
}
} else {
LOG(INFO) << "skip tensor " << x;
}
}
return true;
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
PaddlePlace place;
int output_size{0};
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
config.use_feed_fetch_ops = true;
auto native_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
auto analysis_predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
data_tensor.get(), hidden_init_tensor.get(),
week_tensor.get(), minute_tensor.get(), &data,
FLAGS_batch_size);
// Prepare data for NativePredictor
std::vector<std::vector<PaddleTensor>> native_inputs;
SetInput(&native_inputs);
std::vector<PaddleTensor> native_outputs;
std::vector<PaddleTensor> analysis_outputs;
auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
// Run analysis predictor
int num_ops;
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
ASSERT_EQ(num_ops,
            13);  // After graph optimization, only 13 operators exist.
Timer timer;
double total_time{0};
double native_total_time{0};
double analysis_total_time{0.};
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
auto *output_data = output_tensor->data<float>(&place, &output_size);
  ASSERT_GT(output_size, 0);  // at least one output element
for (int i = 0; i < FLAGS_repeat; i++) {
// Run native predictor.
timer.tic();
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
native_total_time += timer.toc();
}
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
ASSERT_TRUE(
analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
analysis_total_time += timer.toc();
}
if (!FLAGS_with_precision_check) {
return;
}
int native_output_size = VecReduceToInt(native_outputs.front().shape);
EXPECT_EQ(native_output_size, output_size);
// Compare tensors between analysis and zerocopy
auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());
std::vector<std::string> tensor_names;
for (auto &var_desc : p0->program().Block(0).AllVars()) {
tensor_names.push_back(var_desc->Name());
}
LOG(INFO) << "Comparing tensors";
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));
LOG(INFO) << "output1 " << inference::LoDTensorSummary<float>(
p0->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output2 " << inference::LoDTensorSummary<float>(
p1->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output3 " << inference::LoDTensorSummary<float>(
p2->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
for (int i = 0; i < output_size; i++) {
LOG(INFO) << output_data[i] << " "
<< static_cast<float *>(native_outputs.front().data.data())[i]
<< " "
<< static_cast<float *>(analysis_outputs.front().data.data())[i];
EXPECT_NEAR(output_data[i],
static_cast<float *>(native_outputs.front().data.data())[i],
1e-3);
}
LOG(INFO) << "batch_size: " << FLAGS_batch_size;
LOG(INFO) << "zero average time: "
<< total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "analysis average time: "
<< analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "native average time: "
<< native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
double total_time_of_threads{0};
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
}
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
// auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
Timer timer;
double total_time{0};
for (int i = 0; i < FLAGS_repeat; i++) {
PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
cell_init_tensor.get(), data_tensor.get(),
hidden_init_tensor.get(), week_tensor.get(),
minute_tensor.get(), &data, FLAGS_batch_size);
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
});
}
for (auto &t : threads) {
t.join();
}
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}
} // namespace inference
......
......@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
......
......@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {
using contrib::AnalysisConfig;
struct Record {
std::vector<float> data;
......@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
GetFuseStatis(cfg, &num_ops);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
......
......@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
int *num_ops) {
auto predictor = CreateTestPredictor(config);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
using paddle::contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model.");
NativeConfig GetConfigNative() {
NativeConfig config;
config.model_dir = FLAGS_dirname;
// LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.45;
config.use_gpu = true;
config.device = 0;
return config;
}
MixedRTConfig GetConfigTRT() {
MixedRTConfig config;
config.model_dir = FLAGS_dirname;
config.use_gpu = true;
config.fraction_of_gpu_memory = 0.2;
config.device = 0;
config.max_batch_size = 3;
return config;
}
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
NativeConfig config0 = GetConfigNative();
config0.model_dir = model_dirname;
MixedRTConfig config1 = GetConfigTRT();
config1.model_dir = model_dirname;
config1.max_batch_size = batch_size;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
// Prepare inputs
int height = 224;
int width = 224;
float *data = new float[batch_size * 3 * height * width];
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
data[0] = 1.0f;
// Prepare inputs
PaddleTensor tensor;
tensor.name = "input_0";
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data = PaddleBuf(static_cast<void *>(data),
sizeof(float) * (batch_size * 3 * height * width));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
// Get output.
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
TEST(trt_models_test, main) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50",
"resnext50"};
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
}
}
} // namespace paddle
......@@ -36,6 +36,8 @@ namespace memory {
using BuddyAllocator = detail::BuddyAllocator;
BuddyAllocator* GetCPUBuddyAllocator() {
  // We tried thread_local for the inference::RNN1 model, but it did not help
  // much in the multi-thread test.
static std::once_flag init_flag;
static detail::BuddyAllocator* a = nullptr;
......@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() {
return a;
}
// We compared the NaiveAllocator with the BuddyAllocator for CPU memory
// allocation; they seem to have almost the same overhead.
struct NaiveAllocator {
void* Alloc(size_t size) { return malloc(size); }
void Free(void* p) {
PADDLE_ENFORCE(p);
free(p);
}
static NaiveAllocator* Instance() {
static NaiveAllocator x;
return &x;
}
private:
std::mutex lock_;
};
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......
......@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
"Out and Label should have same height.");
int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
int slide_steps = ctx->Attrs().Get<int>("slide_steps");
  PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must be larger than 1");
  PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be a natural number");
ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
slide_steps = slide_steps == 0 ? 1 : slide_steps;
ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
}
protected:
......@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label",
"A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]");
// TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
......@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("AUC",
"A scalar representing the "
"current area-under-the-curve.");
AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("StatNegOut", "Statistic value when label = 0");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC");
AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the"
" roc curve.")
AddAttr<int>(
"num_thresholds",
"The number of thresholds to use when discretizing the roc curve.")
.SetDefault((2 << 12) - 1);
AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
.SetDefault(1);
AddComment(R"DOC(
Area Under The Curve (AUC) Operator.
......
......@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds");
// buckets contain numbers from 0 to num_thresholds
int num_pred_buckets = num_thresholds + 1;
int slide_steps = ctx.Attr<int>("slide_steps");
// Only use output var for now, make sure it's persistable and
// not cleaned up for each batch.
......@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
num_thresholds, batch_auc);
std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
auto stat_pos_calc = stat_pos_data.data();
auto stat_neg_calc = stat_neg_data.data();
statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
}
private:
......@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
const framework::Tensor *label,
inline static void statAuc(const framework::Tensor *label,
const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
const int num_pred_buckets,
const int num_thresholds, const int slide_steps,
int64_t *origin_stat_pos, int64_t *origin_stat_neg,
int64_t **stat_pos, int64_t **stat_neg) {
size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>();
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
for (size_t i = 0; i < batch_size; i++) {
uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds);
if (label_data[i]) {
stat_pos[binIdx] += 1.0;
(*stat_pos)[binIdx] += 1.0;
} else {
(*stat_neg)[binIdx] += 1.0;
}
}
int bucket_length = num_pred_buckets * sizeof(int64_t);
    // With slide_steps == 0, accumulate the statistics over all batches (no
    // sliding window).
if (slide_steps == 0) {
for (int slide = 0; slide < num_pred_buckets; ++slide) {
origin_stat_pos[slide] += (*stat_pos)[slide];
origin_stat_neg[slide] += (*stat_neg)[slide];
}
*stat_pos = origin_stat_pos;
*stat_neg = origin_stat_neg;
} else {
stat_neg[binIdx] += 1.0;
for (int slide = 1; slide < slide_steps; ++slide) {
int dst_idx = (slide - 1) * num_pred_buckets;
        int src_idx = slide * num_pred_buckets;
        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_idx,
                    bucket_length);
        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_idx,
                    bucket_length);
}
std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
*stat_pos, bucket_length);
std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
*stat_neg, bucket_length);
std::memset(*stat_pos, 0, bucket_length);
std::memset(*stat_neg, 0, bucket_length);
for (int slide = 0; slide < num_pred_buckets; ++slide) {
int stat_pos_steps = 0;
int stat_neg_steps = 0;
for (int step = 0; step < slide_steps; ++step) {
stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
}
(*stat_pos)[slide] += stat_pos_steps;
(*stat_neg)[slide] += stat_neg_steps;
}
}
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
*auc = 0.0f;
double totPos = 0.0;
......@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
totPos += stat_pos[idx];
totNeg += stat_neg[idx];
*auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
--idx;
}
......
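The sliding-window bookkeeping in statAuc above boils down to: keep slide_steps snapshots of the bucket array, shift them by one step each batch, append the newest batch, and sum over the window. A self-contained sketch of that idea, simplified to a single statistic array (all names here are illustrative, not from this patch):

#include <cstring>
#include <vector>

// Shift a window of `steps` bucket snapshots by one step, append `batch`,
// and return the per-bucket sum over the whole window.
std::vector<int64_t> SlideAndSum(std::vector<int64_t> *window, int steps,
                                 int buckets,
                                 const std::vector<int64_t> &batch) {
  int64_t *w = window->data();  // window holds steps * buckets counters
  size_t row = buckets * sizeof(int64_t);
  for (int s = 1; s < steps; ++s) {  // drop the oldest snapshot
    std::memcpy(w + (s - 1) * buckets, w + s * buckets, row);
  }
  std::memcpy(w + (steps - 1) * buckets, batch.data(), row);  // append newest
  std::vector<int64_t> sum(buckets, 0);
  for (int s = 0; s < steps; ++s) {
    for (int b = 0; b < buckets; ++b) {
      sum[b] += w[s * buckets + b];
    }
  }
  return sum;
}

This costs O(steps * buckets) per batch, which is cheap given that buckets is only num_thresholds + 1.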
......@@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace());
ctx.device_context());
}
};
......@@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1});
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
......@@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta");
AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC(
Generate Proposals OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
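// kThreadsPerBlock is 64 so that, in the NMS kernel below, each thread's
// pairwise-overlap decision fits into one bit of a uint64_t mask entry.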
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
template <typename T>
__global__ void RangeInitKernel(const T start, const T delta, const int size,
T *out) {
CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
}
template <typename T>
void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
Tensor *value_out, Tensor *index_out) {
int num = value.numel();
Tensor index_in_t;
int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
const T *keys_in = value.data<T>();
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
memory::Free(place, d_temp_storage);
}
template <typename T>
__device__ __forceinline__ T Min(T x, T y) {
return x < y ? x : y;
}
template <typename T>
__device__ __forceinline__ T Max(T x, T y) {
return x > y ? x : y;
}
template <typename T>
__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
const T *var, const int *index,
const T *im_info, const int num,
T *proposals) {
T kBBoxClipDefault = log(1000.0 / 16.0);
CUDA_1D_KERNEL_LOOP(i, num) {
int k = index[i] * 4;
T axmin = anchor[k];
T aymin = anchor[k + 1];
T axmax = anchor[k + 2];
T aymax = anchor[k + 3];
T w = axmax - axmin + 1.0;
T h = aymax - aymin + 1.0;
T cx = axmin + 0.5 * w;
T cy = aymin + 0.5 * h;
T dxmin = deltas[k];
T dymin = deltas[k + 1];
T dxmax = deltas[k + 2];
T dymax = deltas[k + 3];
T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
if (var) {
d_cx = cx + dxmin * w * var[k];
d_cy = cy + dymin * h * var[k + 1];
d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
} else {
d_cx = cx + dxmin * w;
d_cy = cy + dymin * h;
d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
}
T oxmin = d_cx - d_w * 0.5;
T oymin = d_cy - d_h * 0.5;
T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.;
proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
}
}
template <typename T, int BlockSize>
__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num, int *keep_num,
int *keep) {
T im_h = im_info[0];
T im_w = im_info[1];
T im_scale = im_info[2];
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();
int k = i * 4;
T xmin = bboxes[k];
T ymin = bboxes[k + 1];
T xmax = bboxes[k + 2];
T ymax = bboxes[k + 3];
T w = xmax - xmin + 1.0;
T h = ymax - ymin + 1.0;
T cx = xmin + w / 2.;
T cy = ymin + h / 2.;
T w_s = (xmax - xmin) / im_scale + 1.;
T h_s = (ymax - ymin) / im_scale + 1.;
if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) {
keep_index[threadIdx.x] = i;
}
__syncthreads();
if (threadIdx.x == 0) {
int size = (num - i) < BlockSize ? num - i : BlockSize;
for (int j = 0; j < size; ++j) {
if (keep_index[j] > -1) {
keep[cnt++] = keep_index[j];
}
}
}
__syncthreads();
}
if (threadIdx.x == 0) {
keep_num[0] = cnt;
}
}
__device__ inline float IoU(const float *a, const float *b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float inter_s = width * height;
float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return inter_s / (s_a + s_b - inter_s);
}
__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int row_size =
min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
const int col_size =
min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
__shared__ float block_boxes[kThreadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
uint64_t t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
template <typename T>
void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) {
int boxes_num = proposals.dims()[0];
PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
DIVUP(boxes_num, kThreadsPerBlock));
dim3 threads(kThreadsPerBlock);
const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
uint64_t *d_mask =
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<int> keep_vec;
int num_to_keep = 0;
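  // Greedy selection in descending-score order: box i is kept unless an
  // earlier kept box has set its suppression bit; when kept, its overlap
  // masks are folded into remv so it suppresses later overlapping boxes.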
for (int i = 0; i < boxes_num; i++) {
int nblock = i / kThreadsPerBlock;
int inblock = i % kThreadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep;
keep_vec.push_back(i);
uint64_t *p = &h_mask[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
sizeof(int) * num_to_keep, 0);
memory::Free(place, d_mask);
memory::Free(platform::CPUPlace(), h_mask);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CUDADeviceContext &ctx, const Tensor &im_info,
const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas, // [M, 4]
const Tensor &scores, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
// 1. pre nms
Tensor scores_sort, index_sort;
SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
: pre_nms_top_n;
scores_sort.Resize({pre_nms_num, 1});
index_sort.Resize({pre_nms_num, 1});
// 2. box decode and clipping
Tensor proposals;
proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream();
BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
proposals.data<T>());
// 3. filter
Tensor keep_index, keep_num_t;
keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
min_size = std::max(min_size, 1.0f);
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>());
int keep_num;
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
keep_num_t.data<int>(), sizeof(int), 0);
keep_index.Resize({keep_num});
Tensor scores_filter, proposals_filter;
proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(proposals_filter, scores_filter);
}
// 4. nms
Tensor keep_nms;
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
Tensor scores_nms, proposals_nms;
proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
return std::make_pair(proposals_nms, scores_nms);
}
} // namespace
template <typename DeviceContext, typename T>
class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors");
auto *variances = context.Input<Tensor>("Variances");
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
int post_nms_top_n = context.Attr<int>("post_nms_topN");
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
    PADDLE_ENFORCE_GE(eta, 1., "Adaptive NMS is not supported.");
auto &dev_ctx = context.template device_context<DeviceContext>();
auto scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
dev_ctx.GetPlace());
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
anchor->Resize({anchors->numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
T *rpn_rois_data = rpn_rois->data<T>();
T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0);
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = box_score_pair.first;
Tensor scores = box_score_pair.second;
memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
scores.data<T>(), sizeof(T) * scores.numel(), 0);
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
}
framework::LoD lod;
lod.emplace_back(offset);
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(generate_proposals,
ops::CUDAGenerateProposalsKernel<
paddle::platform::CUDADeviceContext, float>);
......@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod();
auto detect_lod = in_detect->lod();
auto& label_lod = in_label->lod();
auto& detect_lod = in_detect->lod();
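    // Bind the LoDs by reference; LoD is a vector of vectors, so taking a
    // copy on every Compute call would be needlessly expensive.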
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
......@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod();
auto detect_lod = input_detect.lod();
auto& label_lod = input_label.lod();
auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0];
auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes;
......@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod);
return;
}
void GetInputPos(const framework::Tensor& input_pos_count,
......@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0];
auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2];
......@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
auto& image_gt_boxes = gt_boxes[n];
for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0;
auto labeled_bboxes = it->second;
auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) {
count = labeled_bboxes.size();
} else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i)
if (!(labeled_bboxes[i].is_difficult)) ++count;
for (auto& box : labeled_bboxes) {
if (!box.is_difficult) {
++count;
}
}
}
if (count == 0) {
continue;
}
int label = it->first;
int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count;
} else {
......
......@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows();
auto &in_rows = in.rows();
auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
......@@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ(
b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
if (ctx->Attrs().Get<bool>("use_peepholes")) {
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection",
frame_size);
ctx->SetOutputDim("CheckedCell", {2, frame_size});
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection or"
"4 * %d if disable peepholes",
frame_size, frame_size);
frame_size);
}
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
......@@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() {
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.")
.AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
......@@ -257,12 +265,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
/* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \
Tensor checked_cell; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
}
/// Compute LSTM
......
......@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_num * sizeof(int64_t), stream);
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......
......@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy(
boost::get<platform::CUDAPlace>(out_place), out_data,
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
in1_value.numel() * sizeof(T), context.stream());
auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place),
......@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value();
......
......@@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
SamplingId Operator.
A layer for sampling an id from the multinomial distribution given by the
input, sampling one id per sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].")
AddAttr<float>("min", "Minimum value of random. (float, default 0.0).")
.SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].")
AddAttr<float>("max", "Maximun value of random. (float, default 1.0).")
.SetDefault(1.0f);
AddAttr<int>("seed",
AddAttr<int>(
"seed",
"Random seed used for the random number engine. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. [default 0].")
"generate the same random numbers every time. (int, default 0).")
.SetDefault(0);
}
};
......
......@@ -77,9 +77,11 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name);
if (in_var_name != out_var_name) {
out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
}
};
class ScaleGradMaker : public framework::SingleGradOpDescMaker {
......
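The guard added above only propagates type and dtype when Out is a distinct variable, since scale may execute in place and Out can then alias X. A reduced sketch with hypothetical stand-in types (the real code compares variable names rather than addresses):

// Hypothetical stand-ins for the framework's variable metadata.
struct VarMeta {
  int type = 0;
  int data_type = 0;
};

void PropagateMeta(const VarMeta& in, VarMeta* out) {
  // Skip self-assignment when the op runs in place (Out aliases X).
  if (&in != out) {
    out->type = in.type;
    out->data_type = in.data_type;
  }
}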
......@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
}
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i],
PADDLE_ENFORCE_LE(0, offset_data[i],
"The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflow.");
}
......
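The two relaxed checks above allow an offset of zero and a slice that ends exactly at the sequence boundary; only a strictly positive length remains required. As a self-contained sketch:

#include <cassert>
#include <cstdint>

void CheckSlice(int64_t seq_begin, int64_t seq_end, int64_t offset,
                int64_t length) {
  assert(0 <= offset);                             // was: 0 < offset
  assert(0 < length);                              // a slice must be non-empty
  assert(seq_begin + offset + length <= seq_end);  // was: strict <
}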
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <algorithm>
#include "paddle/fluid/operators/sgd_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
......@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
}
}
template <typename T, int block_size>
template <typename T>
__global__ void SparseSGDFunctorKernel(const T* selected_rows,
const int64_t* rows,
const T* learning_rate, T* tensor_out,
int64_t row_numel) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
tensor_out += rows[ty] * row_numel;
for (int index = tid; index < row_numel; index += block_size) {
int64_t row_numel, int64_t limit) {
for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
const T* selected_rows_ptr = selected_rows + i * row_numel;
T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
// Since indices in the rows of SelectedRows may be duplicated, we have to
// use atomic operations to avoid concurrent-write errors.
paddle::platform::CudaAtomicAdd(
tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
tensor_out_ptr + index,
-1.0 * learning_rate[0] * selected_rows_ptr[index]);
}
}
}
} // namespace
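The rewrite above removes the compile-time block_size template and the one-block-per-row grid: each block now walks rows with a grid-stride loop, and each thread walks a row's elements with a block-stride loop, so the kernel is correct for any launch shape. A self-contained CUDA sketch of the same pattern (hypothetical AxpyRows kernel, not the Paddle one):

#include <cuda_runtime.h>
#include <cstdint>

__global__ void AxpyRows(const float* src, const int64_t* rows,
                         const float* alpha, float* dst,
                         int64_t row_numel, int64_t num_rows) {
  // Grid-stride over rows: works even when gridDim.x < num_rows.
  for (int64_t i = blockIdx.x; i < num_rows; i += gridDim.x) {
    const float* src_row = src + i * row_numel;
    float* dst_row = dst + rows[i] * row_numel;
    // Block-stride over a row's elements.
    for (int64_t j = threadIdx.x; j < row_numel; j += blockDim.x) {
      // Row indices may repeat, so accumulate atomically.
      atomicAdd(dst_row + j, -alpha[0] * src_row[j]);
    }
  }
}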
......@@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows());
auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
const int kThreadsPerBlock = 256;
int thread_x = kThreadsPerBlock;
int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
ctx.cuda_device_context().stream()>>>(
in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
out_data, in_row_numel);
out_data, in_row_numel, in_rows.size());
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
......
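The new launch configuration above sizes the grid from the device's physical thread capacity instead of launching one block per row. A sketch of a raw CUDA equivalent (GetMaxPhysicalThreadCount is Paddle's helper; that it reports the property-based product below is an assumption):

#include <algorithm>
#include <cuda_runtime.h>

int MaxResidentBlocks(int threads_per_block, int device = 0) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  const int max_threads =
      prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;
  // At least one block, even if threads_per_block exceeds capacity.
  return std::max(max_threads / threads_per_block, 1);
}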